From 46dcf591501d06a28dbb2113799fb586b3f50c9f Mon Sep 17 00:00:00 2001
From: Gustavo Bernardo
Date: Sat, 26 Nov 2016 00:05:17 -0300
Subject: [PATCH] Improved image crawler, now gets more images [Issue:#198]

---
 links/image_crawler.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/links/image_crawler.py b/links/image_crawler.py
index bd36930..8551a2b 100644
--- a/links/image_crawler.py
+++ b/links/image_crawler.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib.request import urlopen
+from urllib.parse import urlparse
 import urllib.request
 
 def make_soup(url):
@@ -20,6 +21,8 @@ def get_images(url,slug):
         return ("Use default image",downloaded)
     images = [img for img in soup.findAll('img')]
     image_links = [each.get('src') for each in images]
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
     link_slug = slug
     filename = ''
     for each in image_links:
@@ -46,15 +49,25 @@ def get_images(url,slug):
                 filename = '.jpeg'
         if not booleano:
             continue
-
-        if each[0] + each[1] == '//' or each[0] == '/':
-            each = 'http:'+each
-        if each[0:4] != 'http' and each[0:5] != 'https':
-            each = url[0:url.index('/',8)] + each
         caminho = "links/static/images/"
         try:
             urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename)
             downloaded = True
         except Exception:
-            continue
+            try:
+                aux = domain + each
+                urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename)
+                downloaded = True
+            except Exception as e:
+                try:
+                    aux2 = url[0:url.index('/',8)] + each
+                    urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename)
+                    downloaded = True
+                except Exception as e:
+                    try:
+                        aux3 = 'http:' + each
+                        urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename)
+                        downloaded = True
+                    except Exception as e:
+                        continue
     return filename,downloaded
-- 
libgit2 0.21.2