发布时间:2019-09-16 07:35:21 | 编辑:auto | 阅读(1564)
之前见过别人写的抓取图片的 Python 脚本,自己之前也用正则表达式写过。最近看到 BeautifulSoup,所以拿来练练手。
# -*- coding:utf8 -*-
"""Download the pictures linked from one listing page of sexy.faceks.com.

Python 2 script (it relies on ``urllib2``).  Importing or running the module
creates a ``sexy`` directory under the current working directory and then
crawls the first listing page.
"""
from bs4 import BeautifulSoup
import os
import sys
import urllib
import urllib2
import time
import random

path = os.getcwd()
new_path = os.path.join(path, u'sexy')
if not os.path.isdir(new_path):
    os.mkdir(new_path)

# Seconds to wait on a single HTTP request before giving up on it.
REQUEST_TIMEOUT = 10


def _read(url):
    """Return the body of *url*, always closing the connection afterwards."""
    response = urllib2.urlopen(url, timeout=REQUEST_TIMEOUT)
    try:
        return response.read()
    finally:
        response.close()


def _try_read(url):
    """Return the body of *url*, or None (after reporting it) on failure.

    Only URL/socket errors (IOError) and malformed URLs (ValueError) are
    treated as "skip this item"; anything else still propagates.
    """
    try:
        return _read(url)
    except (IOError, ValueError) as exc:
        print('skip %s: %s' % (url, exc))
        return None


def page_loop(page=1):
    """Save every picture reachable from listing page number *page*.

    The listing page is searched for ``<a class="img">`` entries; each linked
    detail page is then searched for ``<img>`` tags that carry no ``class``
    attribute, and every such image is written into the ``sexy`` directory.

    A detail page or picture that cannot be fetched is reported and skipped.
    A failure to fetch the listing page itself is not caught and propagates
    to the caller.
    """
    # Percent-encode the tag: raw non-ASCII bytes are not valid in a URL.
    url = 'http://sexy.faceks.com/tag/%s?page=%s' % (
        urllib.quote('美女摄影'), page)
    print(url)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(_read(url), 'html.parser')

    # First collect the entry link of every gallery shown on the listing page.
    for girl in soup.findAll('a', attrs={'class': 'img'}):
        girlink = girl.get('href')
        if not girlink:
            continue  # anchor without a target: nothing to follow
        print(girlink)
        detail = _try_read(girlink)
        if detail is None:
            continue
        per_soup = BeautifulSoup(detail, 'html.parser')

        # Then fetch every picture of that single gallery.
        for img in per_soup.findAll('img', attrs={'class': None}):
            girlurl = img.get('src')
            if not girlurl:
                continue  # <img> without a source: nothing to download
            print(girlurl)
            data = _try_read(girlurl)
            if data is None:
                continue
            # NOTE: the name is wall-clock time plus four random digits and
            # has no file extension.
            name = time.strftime('%H%M%S') + str(random.randint(1000, 9999))
            with open(os.path.join(new_path, name), 'wb') as code:
                code.write(data)


page_loop()
效果图如下:
# -*- coding:utf8 -*-
# __author__ = 'jony'
"""Download gallery pictures from www.27270.com into a ``PICTURE`` directory.

Python 2 script (it relies on ``urllib2``).
"""
from bs4 import BeautifulSoup
import os
import sys
import urllib2
import time
import random
import re

# Seconds to wait on a single HTTP request before giving up on it.
REQUEST_TIMEOUT = 10

# Browser-like User-Agent sent with every request.
_HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}

# Extracts the picture address from a gallery page.
_IMG_PATTERN = re.compile(r'<img alt=".*" src="(.*)" />')

# Follow-up gallery pages are named "<base>_<n>.html"; pages 2..14 are tried.
_FIRST_EXTRA_PAGE = 2
_LAST_EXTRA_PAGE = 14


def _read(url):
    """Return the body of *url*, always closing the connection afterwards."""
    request = urllib2.Request(url, None, _HEADER)
    response = urllib2.urlopen(request, None, timeout=REQUEST_TIMEOUT)
    try:
        return response.read()
    finally:
        response.close()


def _save_image(page_url):
    """Save the picture embedded in *page_url*; return True on success.

    Best effort: any ordinary error (network failure, page without a
    matching <img> tag, failed write) is reported and turned into False.
    KeyboardInterrupt and SystemExit are deliberately not caught, so the
    crawl can still be interrupted.
    """
    try:
        match = _IMG_PATTERN.search(_read(page_url))
        if match is None:
            print('no picture found in %s' % page_url)
            return False
        girlink = match.group(1)
        print(girlink)
        data = _read(girlink)
        name = (time.strftime('%H%M%S') + str(random.randint(1000, 9999))
                + u'.jpg')
        with open(u'PICTURE' + '/' + name, 'wb') as code:
            code.write(data)
    except Exception as exc:
        print('skip %s: %s' % (page_url, exc))
        return False
    return True


def GetUrl():
    """Return the gallery entry links found on the listing page.

    The links are the ``href`` values of ``<a class="MMPic">`` elements
    inside ``<div class="MeinvTuPianBox">`` containers.
    """
    url = 'http://www.27270.com/ent/meinvtupian/'
    response = _read(url)
    # The markup sits on one line, so a regex could not capture the links;
    # BeautifulSoup is used instead.  gb18030 is forced because otherwise bs4
    # warned that some characters could not be decoded and were replaced.
    soup = BeautifulSoup(response, "html.parser", from_encoding="gb18030")
    urls = []
    # Narrow down to the MeinvTuPianBox containers first ...
    for box in soup.find_all('div', attrs={'class': 'MeinvTuPianBox'}):
        # ... then to the MMPic anchors inside them (an <a>, not a <div>).
        for anchor in box.findAll('a', attrs={'class': 'MMPic'}):
            href = anchor.get('href')
            if href:
                urls.append(href)
    return urls


def GetImage(*urls):
    """Download the pictures of every gallery whose first page is in *urls*.

    For each gallery the first page is fetched; when that fails the gallery
    is skipped.  Otherwise the follow-up pages ``<base>_2.html`` through
    ``<base>_14.html`` are tried one by one, and a failing follow-up page is
    skipped without aborting the gallery.  Files are written to the relative
    ``PICTURE`` directory.
    """
    for url in urls:
        print(url)
        # Picture on the initial page of the gallery.
        if not _save_image(url):
            continue
        # e.g. http://www.27270.com/ent/meinvtupian/2016/156239_20.html is
        # the address of the 20th picture of a gallery.
        orignurl = url.split('.html')[0]
        for i in range(_FIRST_EXTRA_PAGE, _LAST_EXTRA_PAGE + 1):
            _save_image('%s_%s.html' % (orignurl, i))


if __name__ == '__main__':
    path = os.getcwd()
    new_path = os.path.join(path, u'PICTURE')
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
    links = GetUrl()
    GetImage(*links)
上一篇: python:ImportError:
下一篇: Python多线程threading用法
47842
46390
37281
34733
29313
25973
24914
19951
19544
18030
5792°
6413°
5927°
5961°
7064°
5911°
5944°
6438°
6404°
7778°