Commit 114cf48a40eec70a5a1fe388c12154cea67bf518

Authored by Gustavo Bernardo
1 parent a482e6e9

Adding image crawler [Issue:#198]

Showing 1 changed file with 56 additions and 0 deletions
links/image-crawler.py 0 → 100644
@@ -0,0 +1,56 @@
+"""Image crawler: download the jpg/jpeg/png images referenced by a page (Issue #198)."""
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import urllib.request
+import urllib.error
+
+
+def make_soup(url):
+    """Fetch and parse `url`; return a fallback marker string on HTTP errors."""
+    try:
+        html = urlopen(url).read()
+        return BeautifulSoup(html, "lxml")
+    except urllib.error.HTTPError:
+        return "Use default image"
+
+
+def get_images(url):
+    """Download every jpg/jpeg/png image referenced by <img> tags on `url`."""
+    try:
+        soup = make_soup(url)
+    except Exception:
+        return "Use default image"
+    if soup is None or isinstance(soup, str):
+        return "Use default image"
+    images = soup.find_all('img')
+    image_links = [img.get('src') for img in images]
+    contador = 0  # contador: counter prefixed to each saved filename
+    for each in image_links:
+        if not each:  # skip <img> tags with a missing or empty src
+            continue
+        booleano = False  # booleano: True once a known image extension is found
+        # Truncate the link at the first occurrence of the extension (drops query strings).
+        if 'jpg' in each:
+            booleano = True
+            each = each[:each.index('jpg') + 3]
+        elif 'png' in each:
+            booleano = True
+            each = each[:each.index('png') + 3]
+        elif 'jpeg' in each:
+            booleano = True
+            each = each[:each.index('jpeg') + 4]
+        if not booleano:
+            continue
+
+        # Resolve protocol-relative and site-relative links against the page's origin.
+        base = '/'.join(url.split('/')[:3])  # e.g. 'http://example.com'
+        if each.startswith('//'):
+            each = 'http:' + each
+        elif each.startswith('/'):
+            each = base + each
+        elif not each.startswith('http'):
+            each = base + '/' + each
+        contador += 1
+        caminho = ""  # caminho: destination directory prefix (current directory by default)
+        filename = each.split('/')[-1]
+        try:
+            urllib.request.urlretrieve(each, caminho + str(contador) + filename)
+        except Exception:
+            continue
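
A minimal usage sketch, not from the commit itself: it assumes the two functions above are in scope (the hyphenated file name links/image-crawler.py prevents a plain import, so the code would be pasted into a script or loaded via importlib), and the URL is a placeholder.

    # Download every jpg/jpeg/png referenced by <img> tags on the page into the
    # current directory, saved as 1<name>, 2<name>, ...
    result = get_images("http://example.com/gallery")
    if result == "Use default image":
        print("Page could not be fetched; fall back to a default image.")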