python 抓取美女图片

发布时间: 2019-09-16 07:35:21 | 编辑: auto | 阅读(1564)

    之前见过别人写的抓取图片的python脚本,自己之前用正则写过,最近看到beautifulsoup 所以拿来练练手

    # -*- coding:utf8 -*-
    from bs4 import BeautifulSoup
    import os, sys, urllib2,time,random
     
    path = os.getcwd()                     
    new_path = os.path.join(path,u'sexy')
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
     
    def page_loop(page=1):
        url = 'http://sexy.faceks.com/tag/美女摄影?page=%s' % page
        print url
        content = urllib2.urlopen(url)
        soup = BeautifulSoup(content)
        my_girl = soup.findAll('a',attrs={'class':'img'})#先获取首页每个美女图片的进入链接
        for girl in my_girl:
            #link = girl.get('src')
            girlink = girl.get('href') 
            print girlink
            response = urllib2.urlopen(girlink)
            per_soup = BeautifulSoup(response)
            img_urls = per_soup.findAll('img',attrs={'class':None})
            #print img_urls
            for img_url in img_urls: #获取单个美女的所有图片链接 
                girlurl = img_url.get('src') 
                print girlurl 
                content2 = urllib2.urlopen(girlurl).read()
                with open(u'sexy'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999)),'wb') as code:
                    code.write(content2)
    page_loop()


    效果图如下:

    wKioL1YJDgDDT0EUAALvokfzBTI986.jpg

    # -*- coding:utf8 -*-
    # __author__ = 'jony'
    from bs4 import BeautifulSoup
    import os, sys, urllib2,time,random
    import re
     
    def GetUrl():
        url = 'http://www.27270.com/ent/meinvtupian/'
        header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
        request = urllib2.Request(url,None,header)
        response = urllib2.urlopen(request,None,timeout=10).read()
        #pattern = re.compile(r'<a href="(.*)" title="(.*)"> class="MMPic"><i><img src="(.*)" width="190" height="280"  alt=.*')#在一行无法正则获取,所以使用BeautifulSoup
        soup = BeautifulSoup(response,"html.parser", from_encoding="gb18030") #WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.所以gb18030
        #soup = BeautifulSoup(response,from_encoding='gb2312')
        #过滤 div为MeinvTuPianBox
        content = soup.find_all('div',attrs={'class':'MeinvTuPianBox'})
        #定义列表
        urls = []
        #titles = []
        #picurls = []
        for i in content:
            #再次过滤 MMpic 注意是a 不是div了
            for j in i.findAll('a',attrs={'class':'MMPic'}):
                urls.append(j.get('href'))
                #titles.append(j.get('title'))    
        return urls
    def GetImage(*urls):
        header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
        pattern = re.compile(r'<img alt=".*" src="(.*)" />')
        for url in urls:
            print url
            #获取初始的页面的图片
            try:
                     request = urllib2.Request(url,None,header)
                     response = urllib2.urlopen(request).read()
                     girlink = pattern.search(response).group(1)
                     print girlink  
                     req = urllib2.Request(girlink,None,header)
                     res = urllib2.urlopen(req,None,timeout=10).read()
                     with open(u'PICTURE'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999))+u'.jpg','wb') as code:
                         code.write(res)
            except:
                 continue          
            #http://www.27270.com/ent/meinvtupian/2016/156239_20.html 第二十张图片的网址
            orignurl=url.split('.html')[0]
            for i in range(2,15):
                picurl = '%s_%s.html' % (orignurl,i)
                #print picurl
                try:
                    request = urllib2.Request(picurl,None,header)
                    response = urllib2.urlopen(request).read()
                    girlink = pattern.search(response).group(1)
                    print girlink  
                    req = urllib2.Request(girlink,None,header)
                    res = urllib2.urlopen(req,None,timeout=10).read()
                    with open(u'PICTURE'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999))+u'.jpg','wb') as code:
                        code.write(res)
                except:
                    continue                 
    if __name__ == '__main__':
        path = os.getcwd()                     
        new_path = os.path.join(path,u'PICTURE')
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
        links = GetUrl()
        #print type(links)
        GetImage(*links)


关键字