Commit 46dcf591501d06a28dbb2113799fb586b3f50c9f

Authored by Gustavo Bernardo
1 parent 1c8a523a

Improved image crawler, now gets more images [Issue:#198]

Showing 1 changed file with 19 additions and 6 deletions
links/image_crawler.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib.request import urlopen
+from urllib.parse import urlparse
 import urllib.request
 
 def make_soup(url):
@@ -20,6 +21,8 @@ def get_images(url,slug):
         return ("Use default image",downloaded)
     images = [img for img in soup.findAll('img')]
     image_links = [each.get('src') for each in images]
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
     link_slug = slug
     filename = ''
     for each in image_links:
@@ -46,15 +49,25 @@ def get_images(url,slug):
             filename = '.jpeg'
         if not booleano:
             continue
-
-        if each[0] + each[1] == '//' or each[0] == '/':
-            each = 'http:'+each
-        if each[0:4] != 'http' and each[0:5] != 'https':
-            each = url[0:url.index('/',8)] + each
         caminho = "links/static/images/"
         try:
             urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename)
             downloaded = True
         except Exception:
-            continue
+            try:
+                aux = domain + each
+                urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename)
+                downloaded = True
+            except Exception as e:
+                try:
+                    aux2 = url[0:url.index('/',8)] + each
+                    urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename)
+                    downloaded = True
+                except Exception as e:
+                    try:
+                        aux3 = 'http:' + each
+                        urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename)
+                        downloaded = True
+                    except Exception as e:
+                        continue
     return filename,downloaded
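For context, the fallback chain added here retries each failed download with three hand-built prefixes: the page's scheme://netloc/, the page URL truncated at the first '/' after the scheme, and a bare 'http:'. A minimal sketch of the same idea using urllib.parse.urljoin is shown below; the helper names (resolve_image_url, download_image) and the candidate order are illustrative assumptions, not part of this commit.

    # Sketch only: resolve an <img> src to an absolute URL before downloading,
    # instead of chaining manual prefix retries. Assumed helpers for illustration.
    import urllib.request
    from urllib.parse import urljoin

    def resolve_image_url(page_url, src):
        """Build an absolute URL for an <img> src found on page_url.
        urljoin handles absolute, protocol-relative (//host/x.png),
        rooted (/x.png) and relative (x.png) forms in one call."""
        return urljoin(page_url, src)

    def download_image(page_url, src, destination):
        """Try the resolved URL first, then the raw src, mirroring the
        commit's chain of fallback retries."""
        for candidate in (resolve_image_url(page_url, src), src):
            try:
                urllib.request.urlretrieve(candidate, destination)
                return True
            except Exception:
                continue
        return False

Because urljoin already covers the domain-prefix, base-URL and protocol-relative cases that the nested try/except blocks handle one by one, a single resolved candidate plus the raw src is usually enough.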