Commit 46dcf591501d06a28dbb2113799fb586b3f50c9f
1 parent 1c8a523a
Exists in master and in 5 other branches
Improved image crawler, now gets more images [Issue:#198]
Showing 1 changed file with 19 additions and 6 deletions
links/image_crawler.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib.request import urlopen
+from urllib.parse import urlparse
 import urllib.request
 
 def make_soup(url):
@@ -20,6 +21,8 @@ def get_images(url,slug):
         return ("Use default image",downloaded)
     images = [img for img in soup.findAll('img')]
     image_links = [each.get('src') for each in images]
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
     link_slug = slug
     filename = ''
     for each in image_links:
@@ -46,15 +49,25 @@ def get_images(url,slug):
             filename = '.jpeg'
         if not booleano:
             continue
-
-        if each[0] + each[1] == '//' or each[0] == '/':
-            each = 'http:'+each
-        if each[0:4] != 'http' and each[0:5] != 'https':
-            each = url[0:url.index('/',8)] + each
         caminho = "links/static/images/"
         try:
             urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename)
             downloaded = True
         except Exception:
-            continue
+            try:
+                aux = domain + each
+                urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename)
+                downloaded = True
+            except Exception as e:
+                try:
+                    aux2 = url[0:url.index('/',8)] + each
+                    urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename)
+                    downloaded = True
+                except Exception as e:
+                    try:
+                        aux3 = 'http:' + each
+                        urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename)
+                        downloaded = True
+                    except Exception as e:
+                        continue
     return filename,downloaded
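
The commit's fallback chain retries urlretrieve with three candidate URLs when the raw src value fails: the page's scheme://netloc/ prefix, the page URL truncated at its first path separator, and a bare 'http:' prefix. As a rough illustration of the same resolution idea, here is a minimal sketch written as a standalone helper; the function name resolve_image_url and the use of urllib.parse.urljoin are assumptions for this sketch and are not part of the commit, which instead probes the candidates by trial and error.

from urllib.parse import urljoin, urlparse

def resolve_image_url(page_url, src):
    # Hypothetical helper (not in the commit) covering the same three cases
    # the nested try/except blocks attempt one after another.
    if src.startswith('//'):
        # Protocol-relative src: borrow the scheme of the page it came from.
        return '{}:{}'.format(urlparse(page_url).scheme, src)
    if src.startswith(('http://', 'https://')):
        # Already absolute: use it unchanged.
        return src
    # Root-relative or relative path: join it against the page URL.
    return urljoin(page_url, src)

# Example:
# resolve_image_url('https://example.com/post/42', '/static/logo.png')
# -> 'https://example.com/static/logo.png'

Resolving the URL once up front avoids issuing a failed download per candidate, but the committed trial-and-error approach has the same effect of recovering images whose src is not an absolute URL.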