Commit 46dcf591501d06a28dbb2113799fb586b3f50c9f
1 parent: 1c8a523a
Exists in master and in 5 other branches
Improved image crawler, now gets more images [Issue:#198]
Showing 1 changed file with 19 additions and 6 deletions
Show diff stats
links/image_crawler.py
1 | 1 | from bs4 import BeautifulSoup |
2 | 2 | from urllib.request import urlopen |
3 | +from urllib.parse import urlparse | |
3 | 4 | import urllib.request |
4 | 5 | |
5 | 6 | def make_soup(url): |
... | ... | @@ -20,6 +21,8 @@ def get_images(url,slug): |
20 | 21 | return ("Use default image",downloaded) |
21 | 22 | images = [img for img in soup.findAll('img')] |
22 | 23 | image_links = [each.get('src') for each in images] |
24 | + parsed_uri = urlparse(url) | |
25 | + domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri) | |
23 | 26 | link_slug = slug |
24 | 27 | filename = '' |
25 | 28 | for each in image_links: |
... | ... | @@ -46,15 +49,25 @@ def get_images(url,slug): |
46 | 49 | filename = '.jpeg' |
47 | 50 | if not booleano: |
48 | 51 | continue |
49 | - | |
50 | - if each[0] + each[1] == '//' or each[0] == '/': | |
51 | - each = 'http:'+each | |
52 | - if each[0:4] != 'http' and each[0:5] != 'https': | |
53 | - each = url[0:url.index('/',8)] + each | |
54 | 52 | caminho = "links/static/images/" |
55 | 53 | try: |
56 | 54 | urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename) |
57 | 55 | downloaded = True |
58 | 56 | except Exception: |
59 | - continue | |
57 | + try: | |
58 | + aux = domain + each | |
59 | + urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename) | |
60 | + downloaded = True | |
61 | + except Exception as e: | |
62 | + try: | |
63 | + aux2 = url[0:url.index('/',8)] + each | |
64 | + urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename) | |
65 | + downloaded = True | |
66 | + except Exception as e: | |
67 | + try: | |
68 | + aux3 = 'http:' + each | |
69 | + urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename) | |
70 | + downloaded = True | |
71 | + except Exception as e: | |
72 | + continue | |
60 | 73 | return filename,downloaded | ... | ... |