Commit 114cf48a40eec70a5a1fe388c12154cea67bf518
1 parent
a482e6e9
Exists in
master
and in
5 other branches
Adding image crawler [Issue:#198]
Showing
1 changed file
with
56 additions
and
0 deletions
Show diff stats
... | ... | @@ -0,0 +1,56 @@ |
1 | +''' | |
2 | +from bs4 import BeautifulSoup | |
3 | +from urllib.request import urlopen | |
4 | +import urllib.request | |
5 | + | |
6 | + | |
def make_soup(url):
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the "lxml" parser, or the
        sentinel string "Use default image" when the page cannot be fetched.
    """
    # Imported locally so the handler does not depend on urllib.request
    # having loaded urllib.error as a side effect.
    from urllib.error import URLError

    try:
        # The context manager closes the response even if read() fails.
        with urlopen(url) as response:
            html = response.read()
    except URLError:
        # HTTPError is a subclass of URLError, so HTTP status errors and
        # connection-level failures both fall back to the sentinel.
        return "Use default image"
    return BeautifulSoup(html, "lxml")
13 | + | |
def _trim_to_image_extension(src):
    """Cut *src* right after its first jpg/png/jpeg marker, or return None."""
    if not src:
        # Covers both a missing src attribute (None) and an empty one.
        return None
    # Same precedence as before: the first marker found in this order wins.
    for extension in ("jpg", "png", "jpeg"):
        position = src.find(extension)
        if position != -1:
            # Drops anything after the extension (query strings, fragments).
            return src[:position + len(extension)]
    return None


def get_images(url):
    """Download the JPEG/PNG images referenced by <img> tags on a page.

    Each image is saved in the current working directory under the name
    "<counter><basename>", where the counter is incremented for every
    candidate link, whether or not its download succeeds.

    Args:
        url: Address of the page whose images should be downloaded.

    Returns:
        The string "Use default image" when the page cannot be fetched or
        parsed; otherwise None once every candidate has been attempted.
    """
    from urllib.parse import urljoin

    try:
        soup = make_soup(url)
    except Exception:
        # Any failure while fetching or parsing means there is no page to scan.
        return "Use default image"
    if soup is None or isinstance(soup, str):
        # make_soup reports a failed fetch with a sentinel string.
        return "Use default image"

    counter = 0
    for image_tag in soup.find_all('img'):
        link = _trim_to_image_extension(image_tag.get('src'))
        if link is None:
            continue
        # urljoin resolves scheme-relative ("//host/a.jpg"), root-relative
        # ("/a.jpg") and plain relative ("a.jpg") links against the page URL,
        # and leaves absolute links untouched.
        link = urljoin(url, link)
        counter += 1
        filename = link.split('/')[-1]
        try:
            urllib.request.urlretrieve(link, str(counter) + filename)
        except Exception:
            # Best effort: one failed download must not stop the rest.
            continue
55 | + | |
56 | +''' | ... | ... |