Commit 46dcf591501d06a28dbb2113799fb586b3f50c9f

Authored by Gustavo Bernardo
1 parent 1c8a523a

Improved image crawler; it now retrieves more images by retrying failed downloads with domain-prefixed, base-URL-prefixed, and scheme-prefixed variants of each image link [Issue: #198]

Showing 1 changed file with 19 additions and 6 deletions   Show diff stats
links/image_crawler.py
1 from bs4 import BeautifulSoup 1 from bs4 import BeautifulSoup
2 from urllib.request import urlopen 2 from urllib.request import urlopen
  3 +from urllib.parse import urlparse
3 import urllib.request 4 import urllib.request
4 5
5 def make_soup(url): 6 def make_soup(url):
@@ -20,6 +21,8 @@ def get_images(url,slug): @@ -20,6 +21,8 @@ def get_images(url,slug):
20 return ("Use default image",downloaded) 21 return ("Use default image",downloaded)
21 images = [img for img in soup.findAll('img')] 22 images = [img for img in soup.findAll('img')]
22 image_links = [each.get('src') for each in images] 23 image_links = [each.get('src') for each in images]
  24 + parsed_uri = urlparse(url)
  25 + domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
23 link_slug = slug 26 link_slug = slug
24 filename = '' 27 filename = ''
25 for each in image_links: 28 for each in image_links:
@@ -46,15 +49,25 @@ def get_images(url,slug): @@ -46,15 +49,25 @@ def get_images(url,slug):
46 filename = '.jpeg' 49 filename = '.jpeg'
47 if not booleano: 50 if not booleano:
48 continue 51 continue
49 -  
50 - if each[0] + each[1] == '//' or each[0] == '/':  
51 - each = 'http:'+each  
52 - if each[0:4] != 'http' and each[0:5] != 'https':  
53 - each = url[0:url.index('/',8)] + each  
54 caminho = "links/static/images/" 52 caminho = "links/static/images/"
55 try: 53 try:
56 urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename) 54 urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename)
57 downloaded = True 55 downloaded = True
58 except Exception: 56 except Exception:
59 - continue 57 + try:
  58 + aux = domain + each
  59 + urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename)
  60 + downloaded = True
  61 + except Exception as e:
  62 + try:
  63 + aux2 = url[0:url.index('/',8)] + each
  64 + urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename)
  65 + downloaded = True
  66 + except Exception as e:
  67 + try:
  68 + aux3 = 'http:' + each
  69 + urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename)
  70 + downloaded = True
  71 + except Exception as e:
  72 + continue
60 return filename,downloaded 73 return filename,downloaded