Commit 383d190b83d0c2cb71c293bf06d5df3f5cfe9317

Authored by Gustavo Bernardo
1 parent 78344a3e

Removed old crawler [Issue:#126]

Showing 1 changed file with 0 additions and 53 deletions   Show diff stats
links/image-crawler.py
@@ -1,53 +0,0 @@
1 -from bs4 import BeautifulSoup  
2 -from urllib.request import urlopen  
3 -import urllib.request  
4 -  
5 -  
def make_soup(url):
    """Fetch *url* and return it parsed with BeautifulSoup (lxml).

    Returns the sentinel string "Use default image" when the page cannot
    be retrieved; callers test for a str return, so the sentinel is kept
    for backward compatibility.
    """
    try:
        html = urlopen(url).read()
        return BeautifulSoup(html, "lxml")
    except urllib.error.URLError:
        # URLError covers HTTPError (its subclass) plus DNS failures and
        # refused connections, which the old HTTPError-only handler let
        # propagate to the caller.
        return "Use default image"
12 -  
def get_images(url):
    """Download every jpg/png/jpeg image linked from the page at *url*.

    Files are saved in the working directory as "<n><original filename>",
    where n counts accepted images. Failed downloads are skipped
    (best-effort). Returns the sentinel string "Use default image" when
    the page could not be fetched/parsed; otherwise returns None.
    """
    try:
        soup = make_soup(url)
    except Exception:
        # make_soup may raise on malformed input; keep the sentinel
        # contract rather than crashing.
        return "Use default image"
    if soup is None or isinstance(soup, str):
        # make_soup signals fetch failure with a sentinel string.
        return "Use default image"

    count = 0
    for src in (img.get('src') for img in soup.findAll('img')):
        if not src:  # missing or empty src attribute
            continue

        # Keep only supported extensions; truncate anything after the
        # extension (query strings, fragments, cache-busters).
        for ext in ('jpeg', 'jpg', 'png'):
            pos = src.find(ext)
            if pos != -1:
                src = src[:pos + len(ext)]
                break
        else:
            continue  # unsupported image type

        # Resolve non-absolute URLs.
        if src.startswith('//'):
            # Protocol-relative: just add a scheme.
            src = 'http:' + src
        elif src.startswith('/') or not src.startswith('http'):
            # Site-relative: join with the page's scheme+host. The old
            # code prepended "http:" to "/path" too, producing the
            # broken URL "http:/path" and skipping this join.
            src = url[:url.index('/', 8)] + src

        count += 1
        filename = src.split('/')[-1]
        try:
            urllib.request.urlretrieve(src, str(count) + filename)
        except Exception:
            # Best-effort: a single failed download must not abort the
            # crawl (preserves original behavior).
            continue