Python 爬虫 多进程清洗代理

发布时间:2019-03-03 09:59:08 | 编辑:auto | 阅读(2203)

    利用多进程(multiprocessing 的 Process 与进程池 Pool)检测代理网站提供的免费代理是否可用

     1 import requests
     2 from lxml import etree
     3 import time
     4 import multiprocessing
     5 
     6 def get_all_proxy(queue):
     7     url = 'http://www.xicidaili.com/nn/1'
     8     headers = {
     9         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    10     }
    11     response = requests.get(url, headers=headers)
    12     html_ele = etree.HTML(response.text)
    13 
    14     ip_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
    15     port_ele = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
    16     # proxy_list = []
    17     for i in range(0,len(ip_eles)):
    18         proxy_str = 'http://' + ip_eles[i] + ':' + port_ele[i]
    19         #proxy_list.append(proxy_str)
    20         #print(proxy_str)
    21         queue.put(proxy_str)
    22 
    23 def check_one_proxy(proxy):
    24     try:
    25         #proxy = proxy_and_queue
    26         url = 'http://www.baidu.com/s?wd=ip'
    27         proxy_dict = {
    28             'http': proxy
    29         }
    30         try:
    31             response = requests.get(url, proxies=proxy_dict, timeout=5)
    32             if response.status_code == 200:
    33                 print(proxy)
    34                 return proxy
    35             else:
    36                 print('bad   '+proxy)
    37                 return proxy
    38         except:
    39             return None
    40     except Exception as e:
    41         print(e)
    42 
    43 if __name__ == '__main__':
    44     start_time = time.time()
    45     # 创建队列
    46     q = multiprocessing.Queue()
    47     # pool 进程池中, 要用的是下面的这个queue
    48     #result_q = multiprocessing.Manager().Queue()
    49     # 获取所有代理
    50     p = multiprocessing.Process(target=get_all_proxy, args=(q,))
    51     p.start()
    52     # proxy_list = get_all_proxy()
    53     # 检测代理的可用性
    54 
    55     pool = multiprocessing.Pool(30)
    56     result_list = []
    57     while True:
    58         try:
    59             proxy_str = q.get(timeout=5)
    60         except:
    61             break
    62         #print('apply_async 之前')
    63         #proxy_and_queue = [proxy_str, result_q]
    64         proxy_res = pool.apply_async(check_one_proxy, (proxy_str,))
    65         result_list.append(proxy_res)
    66     #valid_proxy_list = check_all_proxy(proxy_list)
    67 
    68     valid_proxy_list = []
    69     for proxy_res in result_list:
    70         result = proxy_res.get()
    71         if result is None:
    72             pass
    73         else:
    74             valid_proxy_list.append(result)
    75         #print(result)
    76     print('All proxy we can get:')
    77     print(valid_proxy_list)
    78     pool.close()
    79     pool.join()
    80     p.join()
    81 
    82     end_time = time.time()
    83     print('--'*30)
    84     # print(valid_proxy_list)
    85     print('耗时:' + str(end_time-start_time))

     

关键字