功能:从 image.baidu.com 自动翻页下载图片的 Python 程序
用法:运行程序后,输入关键字即可
#!/usr/bin/python
# filename: getbaidupic.py
# description: get images from image.baidu.com
# author: cjcse
# version: v 0.21
import urllib
import htmllib
import formatter
import string
import os
import sys
import time
import thread
#import threading
class Parser(htmllib.HTMLParser):
#return a dictionary mapping anchor texts to lists of associated hyperlinks
def __init__(self, verbose=0):
self.anchors = {}
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f, verbose)
def anchor_bgn(self, href, name, type):
self.save_bgn()
self.anchor = href
def anchor_end(self):
text = string.strip(self.save_end())
if self.anchor and text:
self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
def GetJpg(url, min_size=11024):
    """Download ``url`` into the directory named by the module-level ``save``.

    The file name is the last ``/``-separated segment of ``url``. If a file
    with that name already exists, ``_1``, ``_2``, ... is inserted before the
    extension until an unused name is found.

    Prints ``<url>\\t[OK]`` on success and ``<url>\\t[Failed]`` otherwise;
    errors are reported this way instead of being raised (best effort).

    Args:
        url: Address of the image to fetch.
        min_size: Responses shorter than this many bytes are reported as
            failed and not written to disk. The default keeps the original
            threshold. NOTE(review): presumably meant to skip thumbnails or
            error pages -- confirm the intended value.

    Returns:
        None.
    """
    global total
    try:
        # ``total`` counts download attempts. Nothing visible in this file
        # initialises it, so start from zero instead of letting the NameError
        # turn every download into a failure.
        try:
            total += 1
        except NameError:
            total = 1

        target = os.path.join(save, url.split("/")[-1])

        # splitext only looks at the final extension, so names containing
        # several dots (or a dotted directory) keep their full stem and suffix.
        stem, ext = os.path.splitext(target)
        counter = 1
        while os.path.exists(target):
            target = "%s_%d%s" % (stem, counter, ext)
            counter += 1

        response = urllib.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()

        if len(data) < min_size:
            print(url + "\t[Failed]")
            return

        out = open(target, "wb")
        try:
            out.write(data)
        finally:
            out.close()
        print(url + "\t[OK]")
    except Exception:
        # Best effort: one bad image must not take down the worker thread.
        print(url + "\t[Failed]")
def GetBaiduNextPage(url):
    """Return ``url`` carrying the paging parameters for the next request.

    Appends ``&rn=<rn>&pn=<pn>&ln=<ln>`` built from the module-level ``rn``,
    ``pn`` and ``ln`` values, then advances the global ``pn`` by 18.

    A paging suffix left at the end of ``url`` by a previous call is removed
    first, so feeding the result back in (as the caller does) yields exactly
    one set of paging parameters instead of an ever-growing, conflicting list.

    Args:
        url: Search URL, with or without a trailing paging suffix.

    Returns:
        The URL for the next page.
    """
    global pn
    import re  # local import: this file does not import re at module level

    base = re.sub(r"&rn=[^&]*&pn=[^&]*&ln=[^&]*$", "", url)
    next_url = "%s&rn=%s&pn=%s&ln=%s" % (base, rn, pn, ln)
    # NOTE(review): the step is fixed at 18 regardless of ``rn`` -- confirm
    # that this matches the number of results per page.
    pn += 18
    return next_url
def GetAllJpg(url):
    """Fetch one page and download every JPEG address embedded in its links.

    The page is parsed with ``Parser``. Each href containing ``.jpg`` is split
    on ``&`` and then on ``=``; any resulting piece that contains both
    ``http://`` and ``.jpg`` (compared case-insensitively) is handed to
    ``GetJpg``.

    Args:
        url: Address of the page to scan.

    Returns:
        None.
    """
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()

    parser = Parser()
    parser.feed(html)
    parser.close()

    for hrefs in parser.anchors.values():
        for href in hrefs:
            if ".jpg" not in href:
                continue
            for param in href.split("&"):
                for piece in param.split("="):
                    # Lower-case only for the comparison; the download must use
                    # the original spelling because URL paths are case-sensitive.
                    lowered = piece.lower()
                    if "http://" in lowered and ".jpg" in lowered:
                        try:
                            GetJpg(piece)
                        except Exception:
                            # Best effort: skip this image, keep scanning.
                            continue
# Interactive entry point: ask for a keyword, then fetch ``pages`` result pages
# concurrently (one worker thread per page) and exit once all have finished.
print("---------------------------------------------------------------------")
print("Description: Get images from image.baidu.com. ")
print("Author: cjcse from CU.")
print("version: v 0.2.")
print("---------------------------------------------------------------------")

keyword = raw_input("Input your keywords: ")
while not keyword.strip():
    keyword = raw_input("Keyword: ")

# The keyword is percent-encoded so spaces and non-ASCII text form a valid URL.
url = ("http://image.baidu.com/i?ct=201326592&cl=2&lm=-1&tn=baiduimage&pv=&word="
       + urllib.quote(keyword) + "&z=5")

# Download directory, read by GetJpg through the module-level name ``save``.
# The existing folder name is kept so earlier downloads stay where they are.
save = "c:\\p_w_picpath_baidu"
try:
    if not os.path.exists(save):
        os.mkdir(save)
except OSError:
    print("Failed to create directory in disk c:")
    sys.exit(1)
print("The images will be stored in folder \"%s\"." % save)

pages = 50
# Paging values read by GetBaiduNextPage.
# NOTE(review): presumably rn = results per page and pn = result offset -- confirm.
rn = 21
pn = 18
ln = 2000


def _fetch_page(page_url, done_lock):
    """Thread body: process one result page, then signal completion."""
    try:
        GetAllJpg(page_url)
    finally:
        # Released even on error so the main thread never waits forever.
        done_lock.release()


page_locks = []
for _ in range(pages):
    done_lock = thread.allocate_lock()
    done_lock.acquire()
    page_locks.append(done_lock)
    thread.start_new_thread(_fetch_page, (url, done_lock))
    url = GetBaiduNextPage(url)

# Sleep-poll instead of spinning: this keeps the CPU idle, lets Ctrl-C
# interrupt the wait, and lets the script end when every page is done.
while any(lock.locked() for lock in page_locks):
    time.sleep(0.5)