Commit 383d190b83d0c2cb71c293bf06d5df3f5cfe9317
1 parent
78344a3e
Exists in master and in 5 other branches
Removed old crawler [Issue:#126]
Showing 1 changed file with 0 additions and 53 deletions
Show diff stats
links/image-crawler.py
@@ -1,53 +0,0 @@ | @@ -1,53 +0,0 @@ | ||
1 | -from bs4 import BeautifulSoup | ||
2 | -from urllib.request import urlopen | ||
3 | -import urllib.request | ||
4 | - | ||
5 | - | ||
6 | -def make_soup(url): | ||
7 | - try: | ||
8 | - html = urlopen(url).read() | ||
9 | - return BeautifulSoup(html,"lxml") | ||
10 | - except urllib.error.HTTPError as e: | ||
11 | - return "Use default image" | ||
12 | - | ||
13 | -def get_images(url): | ||
14 | - try: | ||
15 | - soup = make_soup(url) | ||
16 | - except: | ||
17 | - return("Use default image") | ||
18 | - if soup == None or type(soup) == str: | ||
19 | - return "Use default image" | ||
20 | - images = [img for img in soup.findAll('img')] | ||
21 | - image_links = [each.get('src') for each in images] | ||
22 | - contador = 0 | ||
23 | - for each in image_links: | ||
24 | - booleano = False | ||
25 | - if each != "": | ||
26 | - if each == None: | ||
27 | - continue | ||
28 | - if 'jpg' in each: | ||
29 | - booleano = True | ||
30 | - pos = each.index("jpg") | ||
31 | - each = each[0:pos+3] | ||
32 | - elif 'png' in each: | ||
33 | - booleano = True | ||
34 | - pos = each.index("png") | ||
35 | - each = each[0:pos+3] | ||
36 | - elif 'jpeg' in each: | ||
37 | - booleano = True | ||
38 | - pos = each.index('jpeg') | ||
39 | - each = each[0:pos+4] | ||
40 | - if not booleano: | ||
41 | - continue | ||
42 | - | ||
43 | - if each[0] + each[1] == '//' or each[0] == '/': | ||
44 | - each = 'http:'+each | ||
45 | - if each[0:4] != 'http' and each[0:5] != 'https': | ||
46 | - each = url[0:url.index('/',8)] + each | ||
47 | - contador += 1 | ||
48 | - caminho = "" | ||
49 | - filename=each.split('/')[-1] | ||
50 | - try: | ||
51 | - urllib.request.urlretrieve(each,"%s"%(caminho)+str(contador)+filename) | ||
52 | - except Exception: | ||
53 | - continue |