python 抓取美女图片

发布时间: 2019-09-16 07:35:21 | 编辑: auto | 阅读(1564)

    之前见过别人写的抓取图片的python脚本,自己之前用正则写过,最近看到beautifulsoup 所以拿来练练手

    # -*- coding:utf8 -*-
    from bs4 import BeautifulSoup
    import os, sys, urllib2,time,random
     
    path = os.getcwd()                     
    new_path = os.path.join(path,u'sexy')
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
     
    def page_loop(page=1):
        url = 'http://sexy.faceks.com/tag/美女摄影?page=%s' % page
        print url
        content = urllib2.urlopen(url)
        soup = BeautifulSoup(content)
        my_girl = soup.findAll('a',attrs={'class':'img'})#先获取首页每个美女图片的进入链接
        for girl in my_girl:
            #link = girl.get('src')
            girlink = girl.get('href') 
            print girlink
            response = urllib2.urlopen(girlink)
            per_soup = BeautifulSoup(response)
            img_urls = per_soup.findAll('img',attrs={'class':None})
            #print img_urls
            for img_url in img_urls: #获取单个美女的所有图片链接 
                girlurl = img_url.get('src') 
                print girlurl 
                content2 = urllib2.urlopen(girlurl).read()
                with open(u'sexy'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999)),'wb') as code:
                    code.write(content2)
    page_loop()


    效果图如下:

    wKioL1YJDgDDT0EUAALvokfzBTI986.jpg

    # -*- coding:utf8 -*-
    # __author__ = 'jony'
    from bs4 import BeautifulSoup
    import os, sys, urllib2,time,random
    import re
     
    def GetUrl():
        url = 'http://www.27270.com/ent/meinvtupian/'
        header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
        request = urllib2.Request(url,None,header)
        response = urllib2.urlopen(request,None,timeout=10).read()
        #pattern = re.compile(r'<a href="(.*)" title="(.*)"> class="MMPic"><i><img src="(.*)" width="190" height="280"  alt=.*')#在一行无法正则获取,所以使用BeautifulSoup
        soup = BeautifulSoup(response,"html.parser", from_encoding="gb18030") #WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.所以gb18030
        #soup = BeautifulSoup(response,from_encoding='gb2312')
        #过滤 div为MeinvTuPianBox
        content = soup.find_all('div',attrs={'class':'MeinvTuPianBox'})
        #定义列表
        urls = []
        #titles = []
        #picurls = []
        for i in content:
            #再次过滤 MMpic 注意是a 不是div了
            for j in i.findAll('a',attrs={'class':'MMPic'}):
                urls.append(j.get('href'))
                #titles.append(j.get('title'))    
        return urls
    def GetImage(*urls):
        header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
        pattern = re.compile(r'<img alt=".*" src="(.*)" />')
        for url in urls:
            print url
            #获取初始的页面的图片
            try:
                     request = urllib2.Request(url,None,header)
                     response = urllib2.urlopen(request).read()
                     girlink = pattern.search(response).group(1)
                     print girlink  
                     req = urllib2.Request(girlink,None,header)
                     res = urllib2.urlopen(req,None,timeout=10).read()
                     with open(u'PICTURE'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999))+u'.jpg','wb') as code:
                         code.write(res)
            except:
                 continue          
            #http://www.27270.com/ent/meinvtupian/2016/156239_20.html 第二十张图片的网址
            orignurl=url.split('.html')[0]
            for i in range(2,15):
                picurl = '%s_%s.html' % (orignurl,i)
                #print picurl
                try:
                    request = urllib2.Request(picurl,None,header)
                    response = urllib2.urlopen(request).read()
                    girlink = pattern.search(response).group(1)
                    print girlink  
                    req = urllib2.Request(girlink,None,header)
                    res = urllib2.urlopen(req,None,timeout=10).read()
                    with open(u'PICTURE'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999))+u'.jpg','wb') as code:
                        code.write(res)
                except:
                    continue                 
    if __name__ == '__main__':
        path = os.getcwd()                     
        new_path = os.path.join(path,u'PICTURE')
        if not os.path.isdir(new_path):
            os.mkdir(new_path)
        links = GetUrl()
        #print type(links)
        GetImage(*links)


关键字