Commit 383d190b83d0c2cb71c293bf06d5df3f5cfe9317

Authored by Gustavo Bernardo
1 parent 78344a3e

Removed old crawler [Issue:#126]

Showing 1 changed file with 0 additions and 53 deletions
links/image-crawler.py
@@ -1,53 +0,0 @@
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import urllib.request
-
-
-def make_soup(url):
-    try:
-        html = urlopen(url).read()
-        return BeautifulSoup(html,"lxml")
-    except urllib.error.HTTPError as e:
-        return "Use default image"
-
-def get_images(url):
-    try:
-        soup = make_soup(url)
-    except:
-        return("Use default image")
-    if soup == None or type(soup) == str:
-        return "Use default image"
-    images = [img for img in soup.findAll('img')]
-    image_links = [each.get('src') for each in images]
-    contador = 0
-    for each in image_links:
-        booleano = False
-        if each != "":
-            if each == None:
-                continue
-            if 'jpg' in each:
-                booleano = True
-                pos = each.index("jpg")
-                each = each[0:pos+3]
-            elif 'png' in each:
-                booleano = True
-                pos = each.index("png")
-                each = each[0:pos+3]
-            elif 'jpeg' in each:
-                booleano = True
-                pos = each.index('jpeg')
-                each = each[0:pos+4]
-        if not booleano:
-            continue
-
-        if each[0] + each[1] == '//' or each[0] == '/':
-            each = 'http:'+each
-        if each[0:4] != 'http' and each[0:5] != 'https':
-            each = url[0:url.index('/',8)] + each
-        contador += 1
-        caminho = ""
-        filename=each.split('/')[-1]
-        try:
-            urllib.request.urlretrieve(each,"%s"%(caminho)+str(contador)+filename)
-        except Exception:
-            continue
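
For reference, the removed script fetched a page, collected <img> src values that looked like jpg/png/jpeg files, resolved protocol- and site-relative links, and downloaded each file with a numeric prefix. Below is a minimal sketch of that behaviour in current idiomatic Python, assuming the same BeautifulSoup/urllib stack; the name download_images and the dest_prefix parameter are illustrative only, not part of this project.

from urllib.parse import urljoin
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup


def download_images(url, dest_prefix=""):
    # Fetch the page and parse it; on any network or parse error, do nothing
    # (the old crawler fell back to a default image in that case).
    try:
        soup = BeautifulSoup(urlopen(url).read(), "lxml")
    except Exception:
        return 0
    count = 0
    for img in soup.find_all("img"):
        src = img.get("src")
        # Keep only sources that look like jpg/jpeg/png files.
        if not src or not any(ext in src for ext in ("jpg", "jpeg", "png")):
            continue
        # urljoin resolves protocol-relative (//host/x) and site-relative (/x)
        # links against the page URL, which the old code did by hand.
        absolute = urljoin(url, src)
        count += 1
        filename = absolute.split("/")[-1]
        try:
            urlretrieve(absolute, f"{dest_prefix}{count}{filename}")
        except Exception:
            continue
    return count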