Commit 44df3ca654dbdb1d08767ae3b2696dc276531a82

Authored by Gustavo Bernardo
1 parent 39321fc0

Finished image_crawler [Issue:#198]

Showing 1 changed file with 60 additions and 0 deletions   Show diff stats
links/image_crawler.py 0 → 100644
... ... @@ -0,0 +1,60 @@
  1 +from bs4 import BeautifulSoup
  2 +from urllib.request import urlopen
  3 +import urllib.request
  4 +
def make_soup(url):
    """Fetch *url* and parse the response body as HTML.

    Returns a ``BeautifulSoup`` tree built with the ``"lxml"`` parser, or the
    sentinel string ``"Use default image"`` when the request fails with an
    ``urllib.error.HTTPError``.  Any other exception raised while opening or
    reading the URL propagates to the caller.
    """
    try:
        # Read inside a ``with`` block so the response (and its socket) is
        # closed as soon as the body has been fetched instead of leaking.
        with urlopen(url) as response:
            html = response.read()
    except urllib.error.HTTPError:
        return "Use default image"
    return BeautifulSoup(html, "lxml")
  12 +
def _trim_to_image_extension(src):
    """Cut *src* right after the first image-extension marker it contains.

    Markers are tried in the order ``jpg``, ``png``, ``jpeg``.  Returns
    ``(trimmed_src, extension)`` where *extension* includes the leading dot,
    or ``(None, '')`` when *src* contains none of the markers.
    """
    for marker in ("jpg", "png", "jpeg"):
        pos = src.find(marker)
        if pos != -1:
            # Everything after the marker (query string, fragment, ...) is
            # dropped so the saved file gets a clean extension.
            return src[:pos + len(marker)], "." + marker
    return None, ""


def _resolve_image_url(page_url, src):
    """Turn an ``<img src>`` value into an absolute URL relative to *page_url*."""
    from urllib.parse import urljoin

    # urljoin handles absolute, protocol-relative ('//host/x'), root-relative
    # ('/x') and path-relative ('x') references uniformly.
    absolute = urljoin(page_url, src.strip())
    if absolute.startswith("//"):
        # The page URL carried no scheme to inherit; fall back to plain http.
        absolute = "http:" + absolute
    return absolute


def get_images(url, slug):
    """Download the first retrievable jpg/png/jpeg image found on a page.

    The page at *url* is loaded with ``make_soup`` and its ``<img>`` tags are
    scanned in document order.  The first image whose ``src`` mentions a
    supported extension and can be retrieved is saved as
    ``links/static/images/<slug><extension>``.

    Returns a ``(value, downloaded)`` tuple:

    * ``("Use default image", False)`` when the page could not be loaded;
    * ``(extension, True)`` once an image has been saved, where *extension*
      is ``'.jpg'``, ``'.png'`` or ``'.jpeg'``;
    * ``(extension_or_empty, False)`` when no image could be saved; the first
      element is the extension of the last candidate tried, or ``''`` if
      there was no candidate at all.
    """
    fallback = ("Use default image", False)
    try:
        soup = make_soup(url)
    except Exception:
        # Best effort: any failure to load the page means "no image".
        return fallback
    if soup is None or isinstance(soup, str):
        return fallback

    target_dir = "links/static/images/"
    filename = ''
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            # Missing or empty src attribute.
            continue
        trimmed, extension = _trim_to_image_extension(src)
        if trimmed is None:
            continue
        filename = extension
        try:
            urllib.request.urlretrieve(
                _resolve_image_url(url, trimmed),
                target_dir + str(slug) + filename,
            )
        except Exception:
            # This candidate could not be fetched or written; try the next.
            continue
        return filename, True
    return filename, False
... ...