发布时间:2019-09-26 07:27:17 | 编辑:auto | 阅读(2106)
【用线程池并发检验代理有效性】
#encoding=utf-8
#author: walker
#date: 2016-04-14
#summary: Validate a list of proxies concurrently using a thread pool

import os
import sys
import time

import requests
from concurrent import futures

cur_dir_fullpath = os.path.dirname(os.path.abspath(__file__))

# Request headers sent with every probe request.
Headers = {
    'Accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
}


def Check(desturl, proxy, feature):
    """Check whether a single proxy can fetch ``desturl``.

    Args:
        desturl: URL to request through the proxy.
        proxy: Proxy address, e.g. ``'1.2.3.4:8080'``. A value that already
            contains a scheme (``'http://...'``) is used unchanged.
        feature: Substring that must appear in the UTF-8 decoded response
            body for the proxy to count as valid.

    Returns:
        ``proxy`` (exactly as passed in) when the request succeeds with
        status 200 and the body contains ``feature``; otherwise ``''``.
    """
    # Give the proxy an explicit scheme and use it for both http and https.
    proxy_url = proxy if '://' in proxy else 'http://' + proxy
    proxies = {'http': proxy_url, 'https': proxy_url}

    r = None
    try:
        r = requests.get(url=desturl, headers=Headers, proxies=proxies, timeout=3)
        if r.status_code != 200:
            return ''
        if feature not in r.content.decode('utf8'):
            return ''
    except Exception:
        # Best effort by design: any failure (connection error, timeout,
        # undecodable body, ...) simply means "this proxy is not usable".
        return ''
    finally:
        # Compare against None explicitly: a Response is falsy for non-2xx
        # statuses, and those responses must be closed as well.
        if r is not None:
            r.close()
    return proxy


def GetValidProxyPool(rawProxyPool, desturl, feature):
    """Return the proxies from ``rawProxyPool`` that pass ``Check``.

    Args:
        rawProxyPool: Iterable (set/list) of proxy addresses.
        desturl: URL to request through each proxy.
        feature: Substring expected in the response body.

    Returns:
        List of valid proxies, in the order their checks completed.
    """
    validProxyList = []     # proxies that passed the check
    # The context manager shuts the worker threads down when done.
    with futures.ThreadPoolExecutor(8) as pool:
        futureList = [pool.submit(Check, desturl, proxy, feature)
                      for proxy in rawProxyPool]
        print('\n submit done, waiting for responses\n')
        for future in futures.as_completed(futureList):
            proxy = future.result()
            print('proxy:' + proxy)
            if proxy:   # non-empty string means the proxy is valid
                validProxyList.append(proxy)
                print('validProxyList size:' + str(len(validProxyList)))
    return validProxyList


def GetRawProxyPool():
    """Return the raw (unverified) proxy pool.

    Placeholder: fill the set with proxies obtained from some source.
    """
    rawProxyPool = set()
    # Obtain the raw proxy pool by some means ...
    return rawProxyPool


if __name__ == "__main__":
    rawProxyPool = GetRawProxyPool()
    desturl = 'http://...'      # target URL to be accessed through the proxy
    feature = 'xxx'             # signature string expected in the target page
    validProxyPool = GetValidProxyPool(rawProxyPool, desturl, feature)
【用协程并发检验代理有效性】
在 aiohttp 之外用协程(asyncio)实现异步网络请求的另外两种方式:
1、asyncio + socket(Python simple socket client/server using asyncio)
2、asyncio + requests + run_in_executor (How could I use requests in asyncio?)
#encoding=utf-8
#author: walker
#date: 2017-03-28
#summary: Validate a list of proxies concurrently using coroutines
#Python sys.version:3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)]

import os
import sys
import time
import asyncio
import traceback

import aiohttp

# Request headers sent with every probe request.
Headers = {
    'Accept': '*/*',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
}


async def Check(desturl, proxy, feature):
    """Check whether a single proxy can fetch ``desturl``.

    Args:
        desturl: URL to request through the proxy.
        proxy: Proxy address, e.g. ``'1.2.3.4:8080'``. ``'http://'`` is
            prepended unless the value already contains a scheme.
        feature: Substring that must appear in the UTF-8 decoded response
            body for the proxy to count as valid.

    Returns:
        The proxy URL (including its scheme) when the request succeeds with
        status 200 and the body contains ``feature``; otherwise ``''``.
    """
    if '://' not in proxy:
        proxy = 'http://' + proxy
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(desturl, headers=Headers, proxy=proxy, timeout=10) as resp:
                # Explicit check instead of ``assert``: assertions are
                # stripped when Python runs with -O.
                if resp.status != 200:
                    return ''
                html = await resp.text(encoding='utf-8')
    except Exception:
        # Best effort by design: any failure (connection error, timeout,
        # undecodable body, ...) simply means "this proxy is not usable".
        return ''
    if feature not in html:
        return ''
    return proxy


async def GetValidProxyPool(rawProxyPool, desturl, feature):
    """Return the proxies from ``rawProxyPool`` that pass ``Check``.

    Args:
        rawProxyPool: Iterable (set/list) of proxy addresses.
        desturl: URL to request through each proxy.
        feature: Substring expected in the response body.

    Returns:
        List of valid proxy URLs, in the order their checks completed.
    """
    print('GetValidProxyPool ...')
    validProxyList = []     # proxies that passed the check
    taskList = [asyncio.ensure_future(Check(desturl, proxy, feature))
                for proxy in rawProxyPool]
    for f in asyncio.as_completed(taskList):
        proxy = await f
        if proxy:   # non-empty string means the proxy is valid
            validProxyList.append(proxy)
            print('validProxyList size: %d' % len(validProxyList))
    return validProxyList


def GetRawProxyPool():
    """Return the raw (unverified) proxy pool.

    Placeholder: fill the set with proxies obtained from some source.
    """
    rawProxyPool = set()
    # Obtain the raw proxy pool by some means ...
    return rawProxyPool


if __name__ == "__main__":
    startTime = time.time()
    rawProxyPool = GetRawProxyPool()
    desturl = 'http://...'      # target URL to be accessed through the proxy
    feature = 'xxx'             # signature string expected in the target page
    print('rawProxyPool size:%d' % len(rawProxyPool))

    # Create the loop explicitly: calling get_event_loop() with no running
    # loop is deprecated on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        validProxyList = loop.run_until_complete(
            GetValidProxyPool(rawProxyPool, desturl, feature))
    finally:
        loop.close()

    print('validProxyList size:%d' % len(validProxyList))
    print('time cost:%.2fs' % (time.time()-startTime))
【相关阅读】
concurrent.futures.ThreadPoolExecutor (Python documentation,官方)
12.7 创建一个线程池 (python3-cookbook)
What is the best way to send multiple HTTP requests in Python 3? (stackoverflow)
*** walker ***
上一篇: python3使用cookie免登录爬取
下一篇: centos7安装python3 以及t
48447
47332
38211
35460
29917
26622
25594
20545
20234
18663
47°
54°
158°
109°
138°
242°
327°
328°
305°
394°