Python 爬虫:获取 5i5j 房屋信息

发布时间: 2019-03-01 10:48:56 | 编辑: auto | 阅读: 2188

     1 from lxml import etree
     2 from selenium import webdriver
     3 import pymysql
     4 
     5 def Geturl(fullurl):#获取每个招聘网页的链接
     6     browser.get(fullurl)
     7     shouye_html_text = browser.page_source
     8     shouye_ele = etree.HTML(shouye_html_text)
     9     zf_list = shouye_ele.xpath('/html/body/div[4]/div[1]/div[2]/ul/li/div/h3/a/@href')#链接url
    10     zf_url_list  = []
    11     for zf_url_lost in zf_list:
    12         zf_url  = 'https://bj.5i5j.com'+zf_url_lost
    13         zf_url_list.append(zf_url)
    14     return zf_url_list
    15 def Getinfo(zp_url_list):
    16     for zp_url in zp_url_list:
    17         browser.get(zp_url)
    18         zp_info_html = browser.page_source
    19         zp_ele = etree.HTML(zp_info_html)
    20         zp_info_title = str(zp_ele.xpath('//html/body/div[3]/div[1]/div[1]/h1/text()')[0])
    21         zp_info_num = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/div[1]/div/p[1]/text()')[0])+'元/月'#价格
    22         zp_info_type = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/div[2]/div/p[1]/text()')[0])#户型
    23         zp_info_zone = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/div[3]/div/p[1]/text()')[0])+'平米'#房屋大小
    24         zp_info_need_1 = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/span/text()')[0])#房屋信息
    25         zp_info_need_2 = str(zp_ele.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/a/text()')[0])#房屋信息
    26         zp_info_need = zp_info_need_1+zp_info_need_2
    27         connection = pymysql.connect(host='localhost', user='root', password='1234', db='5i5j', )
    28         try:
    29             with connection.cursor() as cursor:
    30                 sql = "INSERT INTO `5i5j_info` (`title`,`num`,`type`, `zone`,`need`) VALUES (%s,%s,%s,%s, %s)"
    31                 cursor.execute(sql, (zp_info_title,zp_info_num,zp_info_type,zp_info_zone,zp_info_need))
    32             connection.commit()
    33         finally:
    34             connection.close()
    35         print(zp_info_title,zp_info_num,zp_info_type,zp_info_zone,zp_info_need)
    36 if __name__ == '__main__':
    37     browser = webdriver.Chrome()
    38     pags = int(input('需要几页?'))
    39     for i in range(1,pags+1):
    40         url = 'https://bj.5i5j.com/zufang/huilongguan/n{}/'
    41         fullurl = url.format(str(i))
    42         zf_url_list = Geturl(fullurl)
    43         print(fullurl)
    44         # print(zf_url_list)
    45         Getinfo(zf_url_list)
    46     browser.close()

     

关键字