python2.7爬取可用代理IP

发布时间: 2019-08-29 07:34:28 · 编辑: auto · 阅读: 1489

    import urllib2

    import random

    import time

    import re

    #from lxml import etree  #第三方模块



    def get_proxy(page):

    headers = {

    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

    }

    req = urllib2.Request('http://www.xicidaili.com/nn/{}'.format(page),headers=headers) #构造一个Request对象

    response = urllib2.urlopen(req) #发送请求

    html = response.read()

    proxy_list = []

    ip_port_list = re.findall(r'<tr class=.*?>(.*?)</tr>',html,re.S)

    #ip_list = re.findall(r'\d+\.\d+\.\d+\.\d+\',html)

    print len(ip_port_list)

    for i in ip_port_list:

    ip = re.findall('\d+\.\d+\.\d+\.\d+\.',i)[0]

    port = re.findall(r'<td>(\d+)</td>',i)[0]

    #print ip,port  #打印测试

    proxy = '{}:{}'.format(ip,port)

    proxy_list.append(proxy)

    return proxy_list

    def proxy_read(proxy_list,i):

    proxy = proxy_list[i]

    print u'当前代理IP:{}'.format(proxy)

    sleep_time = random.randint(1,3)

    print '等待{}秒'.format(sleep_time)

    time.sleep(sleep_time)

    #urllib2 里面的方法

    proxt_suport = urllib2.ProxyHandler({'http':proxy}) #构建代理Handler

    opener = urllib2.build_opener(proxt_suport) #通过build_opener方法来使用Handler对象,然后创建opener对象

    urllib2.install_opener(opener) #把opener对象变成全局的,之后使用的urlopen对象都是全局的

    req = urllib2.Request('http://httpbin.org/ip')

    try:

    html = urllib2.urlopen(req).read()

    print html

    except Exception as e:

    print e

    print u'***打开失败***'

    print u'当前ip不可用'


    if __name__ == '__name__':

        proxy_list = get_proxy(1)

    print '开始测试'

    for i in range(100):

    proxy.read(proxt_list,i)


关键字