image_crawler.py 2.53 KB
Edit Raw Blame History Permalink



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73


import urllib.error
import urllib.request
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen

from bs4 import BeautifulSoup

def make_soup(url):
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or the
        sentinel string ``"Use default image"`` when the server answers with
        an HTTP error status.

    Raises:
        Any non-HTTP failure raised by ``urlopen`` (for example ``ValueError``
        for a malformed URL or ``urllib.error.URLError`` for an unreachable
        host) is not handled here and propagates to the caller.
    """
    try:
        # The context manager closes the connection even if read() fails;
        # previously the response object was never closed.
        with urlopen(url) as response:
            html = response.read()
    except urllib.error.HTTPError:
        return "Use default image"
    return BeautifulSoup(html, "lxml")

def _split_image_extension(src):
    """Cut *src* right after its first recognised image extension.

    The extensions are probed in the order ``jpg``, ``png``, ``jpeg`` using a
    case-sensitive substring match, so anything following the extension (such
    as a query string) is dropped.

    Returns:
        ``(trimmed_src, dotted_extension)`` when an extension is found,
        otherwise ``(None, '')``.
    """
    for ext in ('jpg', 'png', 'jpeg'):
        if ext in src:
            return src[:src.index(ext) + len(ext)], '.' + ext
    return None, ''


def _candidate_urls(page_url, src):
    """Return the absolute URLs to try for *src*, most likely first.

    The first candidate resolves *src* against the page URL. The second, added
    only when it differs, resolves it against the site root, which matters
    solely for page-relative paths.
    """
    site_root = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(page_url))
    candidates = [urljoin(page_url, src)]
    root_guess = urljoin(site_root, src)
    if root_guess not in candidates:
        candidates.append(root_guess)
    return candidates


def get_images(url, slug, image_dir="links/static/images/"):
    """Download the first retrievable jpg/png/jpeg image found on a page.

    The page is fetched through ``make_soup``. Each ``<img>`` ``src`` is
    examined in document order, and the first one that can be downloaded is
    saved as ``image_dir + str(slug) + extension``.

    Args:
        url: Address of the page to scan.
        slug: Base name for the saved file; converted with ``str``.
        image_dir: Prefix prepended verbatim to the file name, so it must end
            with a path separator. Defaults to ``"links/static/images/"``.

    Returns:
        A ``(filename, downloaded)`` tuple. When the page cannot be fetched or
        parsed the tuple is ``("Use default image", False)``. Otherwise
        ``filename`` is the dotted extension of the last image attempted
        (``''`` if none qualified) and ``downloaded`` tells whether a file was
        actually saved.
    """
    downloaded = False
    try:
        soup = make_soup(url)
    except Exception:
        # Best effort: any fetch/parse failure means "use the default image".
        # Exception (not a bare except) lets Ctrl-C and SystemExit through.
        return ("Use default image", downloaded)
    if soup is None or isinstance(soup, str):
        return ("Use default image", downloaded)

    filename = ''
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            continue
        trimmed, extension = _split_image_extension(src)
        if trimmed is None:
            continue
        filename = extension
        destination = image_dir + str(slug) + filename
        try:
            candidates = _candidate_urls(url, trimmed)
        except ValueError:
            # A src that cannot be parsed as a URL is skipped.
            continue
        for candidate in candidates:
            try:
                urllib.request.urlretrieve(candidate, destination)
            except Exception:
                # Deliberately broad: a failed download only means we move on
                # to the next candidate or the next image.
                continue
            downloaded = True
            break
        if downloaded:
            break
    return filename, downloaded