Commit 114cf48a40eec70a5a1fe388c12154cea67bf518

Authored by Gustavo Bernardo
1 parent a482e6e9

Adding image crawler [Issue:#198]

Showing 1 changed file with 56 additions and 0 deletions
links/image-crawler.py 0 → 100644
@@ -0,0 +1,56 @@
+"""Image crawler: download the jpg/jpeg/png images referenced by a page (Issue #198)."""
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import urllib.request
+import urllib.error
+
+
+def make_soup(url):
+    """Fetch and parse `url`; return a fallback marker string on HTTP errors."""
+    try:
+        html = urlopen(url).read()
+        return BeautifulSoup(html, "lxml")
+    except urllib.error.HTTPError:
+        return "Use default image"
+
+
+def get_images(url):
+    """Download every jpg/jpeg/png image referenced by <img> tags on `url`."""
+    try:
+        soup = make_soup(url)
+    except Exception:
+        return "Use default image"
+    if soup is None or isinstance(soup, str):
+        return "Use default image"
+    images = soup.find_all('img')
+    image_links = [img.get('src') for img in images]
+    contador = 0  # contador: counter prefixed to each saved filename
+    for each in image_links:
+        if not each:  # skip <img> tags with a missing or empty src
+            continue
+        booleano = False  # booleano: True once a known image extension is found
+        # Truncate the link at the first occurrence of the extension (drops query strings).
+        if 'jpg' in each:
+            booleano = True
+            each = each[:each.index('jpg') + 3]
+        elif 'png' in each:
+            booleano = True
+            each = each[:each.index('png') + 3]
+        elif 'jpeg' in each:
+            booleano = True
+            each = each[:each.index('jpeg') + 4]
+        if not booleano:
+            continue
+
+        # Resolve protocol-relative and site-relative links against the page's origin.
+        base = '/'.join(url.split('/')[:3])  # e.g. 'http://example.com'
+        if each.startswith('//'):
+            each = 'http:' + each
+        elif each.startswith('/'):
+            each = base + each
+        elif not each.startswith('http'):
+            each = base + '/' + each
+        contador += 1
+        caminho = ""  # caminho: destination directory prefix (current directory by default)
+        filename = each.split('/')[-1]
+        try:
+            urllib.request.urlretrieve(each, caminho + str(contador) + filename)
+        except Exception:
+            continue
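
A minimal usage sketch, not from the commit itself: it assumes the two functions above are in scope (the hyphenated file name links/image-crawler.py prevents a plain import, so the code would be pasted into a script or loaded via importlib), and the URL is a placeholder.

    # Download every jpg/jpeg/png referenced by <img> tags on the page into the
    # current directory, saved as 1<name>, 2<name>, ...
    result = get_images("http://example.com/gallery")
    if result == "Use default image":
        print("Page could not be fetched; fall back to a default image.")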