Improved the image crawler: it now retrieves more images by retrying failed downloads with alternative URL prefixes [Issue:#198]

Gustavo Bernardo
1 parent 1c8a523a
Showing 1 changed file with 19 additions and 6 deletions. Show diff stats
links/image_crawler.py
 from bs4 import BeautifulSoup
 from urllib.request import urlopen
+from urllib.parse import urlparse
 import urllib.request
  
 def make_soup(url):
@@ -20,6 +21,8 @@ def get_images(url,slug):
         return ("Use default image",downloaded)
     images = [img for img in soup.findAll('img')]
     image_links = [each.get('src') for each in images]
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
     link_slug = slug
     filename = ''
     for each in image_links:
@@ -46,15 +49,25 @@ def get_images(url,slug):
                 filename = '.jpeg'
             if not booleano:
                 continue
-
-            if each[0] + each[1] == '//' or each[0] == '/':
-                each = 'http:'+each
-            if each[0:4] != 'http' and each[0:5] != 'https':
-                each = url[0:url.index('/',8)] + each
             caminho = "links/static/images/"
             try:
                 urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename)
                 downloaded = True
             except Exception:
-                continue
+                try:
+                    aux = domain + each
+                    urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename)
+                    downloaded = True
+                except Exception as e:
+                    try:
+                        aux2 = url[0:url.index('/',8)] + each
+                        urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename)
+                        downloaded = True
+                    except Exception as e:
+                        try:
+                            aux3 = 'http:' + each
+                            urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename)
+                            downloaded = True
+                        except Exception as e:
+                            continue
     return filename,downloaded
1	1	from bs4 import BeautifulSoup
2	2	from urllib.request import urlopen
	3	+from urllib.parse import urlparse
3	4	import urllib.request
4	5
5	6	def make_soup(url):
...	...	@@ -20,6 +21,8 @@ def get_images(url,slug):
20	21	return ("Use default image",downloaded)
21	22	images = [img for img in soup.findAll('img')]
22	23	image_links = [each.get('src') for each in images]
	24	+ parsed_uri = urlparse(url)
	25	+ domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
23	26	link_slug = slug
24	27	filename = ''
25	28	for each in image_links:
...	...	@@ -46,15 +49,25 @@ def get_images(url,slug):
46	49	filename = '.jpeg'
47	50	if not booleano:
48	51	continue
49		-
50		- if each[0] + each[1] == '//' or each[0] == '/':
51		- each = 'http:'+each
52		- if each[0:4] != 'http' and each[0:5] != 'https':
53		- each = url[0:url.index('/',8)] + each
54	52	caminho = "links/static/images/"
55	53	try:
56	54	urllib.request.urlretrieve(each,"%s"%(caminho)+str(link_slug)+filename)
57	55	downloaded = True
58	56	except Exception:
59		- continue
	57	+ try:
	58	+ aux = domain + each
	59	+ urllib.request.urlretrieve(aux,"%s"%(caminho)+str(link_slug)+filename)
	60	+ downloaded = True
	61	+ except Exception as e:
	62	+ try:
	63	+ aux2 = url[0:url.index('/',8)] + each
	64	+ urllib.request.urlretrieve(aux2,"%s"%(caminho)+str(link_slug)+filename)
	65	+ downloaded = True
	66	+ except Exception as e:
	67	+ try:
	68	+ aux3 = 'http:' + each
	69	+ urllib.request.urlretrieve(aux3,"%s"%(caminho)+str(link_slug)+filename)
	70	+ downloaded = True
	71	+ except Exception as e:
	72	+ continue
60	73	return filename,downloaded
...	...