发布时间:2018-07-25 08:24:53 | 编辑:Run | 阅读(7416)
用Selenium&PhantomJS完成的网络爬虫,最适合使用的情形是爬取有JavaScript的网站,用来爬其他的站点也一样给力
准备环境
本例将从 https://www.kuaidaili.com/ops/proxylist/1/ 获取已经验证好的代理服务器。首先打开目标网站:
目标分析:
所有的代理信息都在 tr 标签(包括 class='odd' 的 tr 标签)里面,每个 tr 对应一条代理记录
再来分析下一页的地址:URL 末尾的数字代表第几页
项目实施:
在目录下创建一个getProxyFromDaili.py文件,代码如下:
#!/usr/bin/env python
# coding: utf-8
"""Scrape the proxy list pages of kuaidaili.com with Selenium + PhantomJS.

NOTE(review): this script uses the Selenium 3 API (``webdriver.PhantomJS`` and
``find_element(s)_by_xpath``); newer Selenium releases no longer ship these --
confirm the installed Selenium version before running.
"""

from selenium import webdriver

from mylog import MyLog as mylog

# Item attribute names in table-column order: the N-th name is read from the
# N-th <td> of a row (XPath positions are 1-based).
_COLUMNS = (
    'ip',
    'port',
    'anonymous',
    'type',
    'support',
    'position',
    'responsive_speed',
    'final_verification_time',
)


class Item(object):
    """Plain record holding the eight columns of one proxy table row."""

    ip = None  # proxy IP
    port = None  # proxy port
    anonymous = None  # whether the proxy is anonymous
    type = None  # type
    support = None  # supported protocols
    position = None  # location
    responsive_speed = None  # response speed
    final_verification_time = None  # last verification time


class GetProxy(object):
    """Crawl list pages 1-10 and append every proxy row to ``proxy.txt``.

    Instantiating the class runs the whole crawl: the page URLs are built,
    each page is loaded in PhantomJS and every table row is appended to
    ``self.filename`` as one tab-separated line.
    """

    def __init__(self):
        self.startUrl = 'https://www.kuaidaili.com/ops/proxylist/'
        self.log = mylog()
        self.urls = self.getUrls()
        self.filename = 'proxy.txt'
        self.getProxyList(self.urls)

    def getUrls(self):
        """Return the URLs of list pages 1 through 10, logging each one."""
        urls = []
        for page in range(1, 11):
            url = self.startUrl + str(page)
            urls.append(url)
            self.log.info("添加url:{}到urls列表".format(url))
        return urls

    def getProxyList(self, urls):
        """Load every URL in *urls* and append its proxy rows to the output file.

        The browser is always shut down, even when loading or parsing a page
        raises, so no PhantomJS process is left behind.
        """
        browser = webdriver.PhantomJS()
        try:
            # The implicit wait is a session-wide setting; set it once.
            browser.implicitly_wait(5)
            for url in urls:
                browser.get(url)
                rows = browser.find_elements_by_xpath(
                    '//div[@id="freelist"]//tbody[@class="center"]/tr')
                if not rows:
                    # Nothing to write: do not touch the output file.
                    continue
                # One open per page instead of one per row.
                with open(self.filename, 'a', encoding='utf8') as fp:
                    for row in rows:
                        item = self._parseRow(row)
                        self.log.info('添加proxy {}:{} 到proxyList'.format(item.ip, item.port))
                        self.log.info('添加proxy到{}'.format(self.filename))
                        fp.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\r\n".format(
                            item.ip,
                            item.port,
                            item.anonymous,
                            item.type,
                            item.support,
                            item.position,
                            item.responsive_speed,
                            item.final_verification_time,
                        ))
        finally:
            browser.quit()

    @staticmethod
    def _parseRow(row):
        """Build an Item from one ``<tr>`` element, one attribute per ``<td>``."""
        item = Item()
        for index, field in enumerate(_COLUMNS, start=1):
            cell = row.find_element_by_xpath('./td[{}]'.format(index))
            setattr(item, field, cell.text)
        return item


if __name__ == '__main__':
    GP = GetProxy()
创建mylog.py文件,代码如下:
#!/usr/bin/env python
# coding: utf-8
"""Small logging wrapper that writes to both the console and a log file."""

import getpass
import logging
import os
import sys


# Definition of the MyLog class
class MyLog(object):
    """Logger named after the current user, writing to screen and to a file.

    The log file sits next to the running script and has the same name with
    the extension replaced by ``.log`` (derived from ``sys.argv[0]``).

    Creating several ``MyLog`` objects is safe: handlers already attached to
    the underlying logger are reused, so a message is never emitted twice.
    """

    def __init__(self):
        self.user = getpass.getuser()  # current user name
        self.logger = logging.getLogger(self.user)
        self.logger.setLevel(logging.DEBUG)

        # Log file name: derived from the calling script's name. splitext
        # strips only a real extension, so a script without ".py" keeps its
        # full name.
        self.logfile = os.path.splitext(sys.argv[0])[0] + '.log'
        self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(message)-12s\r\n')

        # Logs are shown on screen and written to the log file.
        # logging.getLogger() returns the same object for the same name, so
        # handlers are only created when that logger does not have them yet.
        self.logHand = self._fileHandler()
        self.logHandSt = self._streamHandler()

    def _fileHandler(self):
        """Return the file handler for ``self.logfile``, attaching it if missing."""
        target = os.path.abspath(self.logfile)
        for handler in self.logger.handlers:
            if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
                return handler
        handler = logging.FileHandler(self.logfile, encoding='utf-8')
        handler.setFormatter(self.formatter)
        handler.setLevel(logging.DEBUG)
        self.logger.addHandler(handler)
        return handler

    def _streamHandler(self):
        """Return the console handler, attaching it if missing."""
        for handler in self.logger.handlers:
            # Exact type match: FileHandler is a StreamHandler subclass.
            if type(handler) is logging.StreamHandler:
                return handler
        handler = logging.StreamHandler()
        handler.setFormatter(self.formatter)
        handler.setLevel(logging.DEBUG)
        self.logger.addHandler(handler)
        return handler

    # The five log levels map to the five methods below.
    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)

    def info(self, msg):
        """Log *msg* at INFO level."""
        self.logger.info(msg)

    def warn(self, msg):
        """Log *msg* at WARNING level."""
        # Logger.warn is a deprecated alias; Logger.warning is the supported call.
        self.logger.warning(msg)

    def error(self, msg):
        """Log *msg* at ERROR level."""
        self.logger.error(msg)

    def critical(self, msg):
        """Log *msg* at CRITICAL level."""
        self.logger.critical(msg)


if __name__ == '__main__':
    mylog = MyLog()
    mylog.debug(u"I'm debug 中文测试")
    mylog.info(u"I'm info 中文测试")
    mylog.warn(u"I'm warn 中文测试")
    mylog.error(u"I'm error 中文测试")
    mylog.critical(u"I'm critical 中文测试")
pycharm运行截图
proxy.txt文件截图
47905
46483
37396
34798
29368
26030
24999
19999
19618
18100
5836°
6474°
5982°
6003°
7114°
5954°
6004°
6491°
6457°
7835°