Python爬虫项目--爬取链家热门城市

发布时间:2019-03-31 20:51:49编辑:auto阅读(2088)

    本次实战是利用爬虫爬取链家的新房(声明: 内容仅用于学习交流, 请勿用作商业用途)

    环境

    win8, python 3.7, pycharm

    正文

    1. 目标网站分析

    通过分析, 找出相关url, 确定请求方式, 是否存在js加密等.

    2. 新建scrapy项目

    1. 在cmd命令行窗口中输入以下命令, 创建lianjia项目

    scrapy startproject lianjia

    2. 在cmd中进入lianjia目录, 创建Spider文件

    cd lianjia
    scrapy genspider -t crawl xinfang lianjia.com

    这次创建的是CrawlSpider类, 该类适用于批量爬取网页

    3. 新建main.py文件, 用于执行scrapy项目文件

    到现在, 项目就创建完成了, 下面开始编写项目

    3 定义字段

    在items.py文件中定义需要爬取的字段信息

    import scrapy
    from scrapy.item import Item, Field
    
    class LianjiaItem(Item):
        """Record describing one new-home listing.

        Each attribute is declared as a Scrapy ``Field``; the comment above
        every declaration states what the value stands for.
        """

        # City name.
        city = Field()
        # Name of the housing development.
        name = Field()
        # Property type.
        type = Field()
        # Status.
        status = Field()
        # District the development belongs to.
        region = Field()
        # Street.
        street = Field()
        # Detailed address.
        address = Field()
        # Floor area.
        area = Field()
        # Average price.
        average_price = Field()
        # Total price.
        total_price = Field()
        # Tags.
        tags = Field()

    4 爬虫主程序

    在xinfang.py文件中编写我们的爬虫主程序

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from lianjia.items import LianjiaItem
    
    class XinfangSpider(CrawlSpider):
        """Crawl the new-home ("loupan") listing pages reachable from the start URL.

        The rules first follow the links found in the footer link list, then
        the links to each listing index, whose response is handed to
        ``parse_item``.  ``parse_item`` schedules every result page and
        ``parse_detail`` turns each listing on a page into a ``LianjiaItem``.
        """

        name = 'xinfang'
        allowed_domains = ['lianjia.com']
        start_urls = ['https://bj.fang.lianjia.com/']

        # Number of listings assumed per result page (the divisor the page
        # count is derived from).
        PAGE_SIZE = 10
        # The site exposes at most this many result pages.
        MAX_PAGES = 100

        # Crawl rules. LinkExtractor pulls links out of a page: ``allow`` is the
        # pattern a URL must match and ``restrict_xpaths`` limits where in the
        # page links are looked for. ``follow=True`` keeps following the
        # extracted links; ``callback`` names the method that handles the response.
        rules = (
            Rule(
                LinkExtractor(
                    allow=r'\.fang.*com/$',
                    restrict_xpaths='//div[@class="footer"]//div[@class="link-list"]/div[2]/dd',
                ),
                follow=True,
            ),
            Rule(
                LinkExtractor(
                    allow=r'.*loupan/$',
                    restrict_xpaths='//div[@class="xinfang-all"]/div/a',
                ),
                callback='parse_item',
                follow=True,
            ),
        )

        def parse_item(self, response):
            """Schedule a request for every result page of one listing index.

            The total number of listings is read from the ``data-total-count``
            attribute of the ``page-box`` element.  Pages ``pg1`` .. ``pgN`` are
            requested, where N is the number of pages needed for that total,
            capped at ``MAX_PAGES``.

            Yields:
                Requests whose responses are handled by ``parse_detail``.
            """
            counts = response.xpath('//div[@class="page-box"]/@data-total-count').extract_first()
            try:
                total = int(counts)
            except (TypeError, ValueError):
                # Attribute missing or not numeric: nothing to paginate.
                self.logger.warning("No listing count found on %s", response.url)
                return

            # Ceiling division, so an exact multiple of PAGE_SIZE does not
            # request one extra page.
            last_page = min(-(-total // self.PAGE_SIZE), self.MAX_PAGES)
            for page in range(1, last_page + 1):
                url = response.url + "pg" + str(page)
                # response.follow builds the Request without needing the
                # top-level ``scrapy`` module in scope.
                yield response.follow(url, callback=self.parse_detail)

        def parse_detail(self, response):
            """Extract one item per listing from a result page.

            Yields:
                A fresh ``LianjiaItem`` for every ``li`` under the
                ``resblock-list-wrapper`` list.
            """
            # The first character of the extracted text is dropped
            # (presumably a leading separator - confirm against the page).
            city = response.xpath(
                '//div[@class="resblock-have-find"]/span[3]/text()'
            ).extract_first(default="")[1:]

            for info in response.xpath('//ul[@class="resblock-list-wrapper"]/li'):
                # A new item per listing: items already yielded must not be
                # mutated by later iterations.
                item = LianjiaItem()
                item["city"] = city
                item["name"] = info.xpath('div/div[1]/a/text()').extract_first()
                item["type"] = info.xpath('div/div[1]/span[1]/text()').extract_first()
                item["status"] = info.xpath('div/div[1]/span[2]/text()').extract_first()
                item["region"] = info.xpath('div/div[2]/span[1]/text()').extract_first()
                item["street"] = info.xpath('div/div[2]/span[2]/text()').extract_first()
                item["address"] = info.xpath('div/div[2]/a/text()').extract_first(default="").replace(",", "")
                item["area"] = info.xpath('div/div[@class="resblock-area"]/span/text()').extract_first()
                item["average_price"] = "".join(
                    info.xpath('div//div[@class="main-price"]//text()').extract()
                ).replace(" ", "")
                item["total_price"] = info.xpath('div//div[@class="second"]/text()').extract_first()
                item["tags"] = ";".join(
                    info.xpath('div//div[@class="resblock-tag"]//text()').extract()
                ).replace(" ", "").replace("\n", "")
                yield item

    5 保存到Mysql数据库

    在pipelines.py文件中编辑如下代码

    import pymysql
    class LianjiaPipeline(object):
        """Item pipeline that inserts every scraped listing into MySQL.

        A connection and a cursor are opened when the pipeline is created and
        released again in ``close_spider``.
        """

        # Item keys read for each insert, in the column order of INSERT_SQL.
        COLUMNS = (
            "city", "name", "type", "status", "region", "street",
            "address", "area", "average_price", "total_price", "tags",
        )
        INSERT_SQL = "INSERT INTO xinfang(city, name, type, status, region, street, address, area, average_price, total_price, tags) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

        def __init__(self):
            # Open the database connection.
            # NOTE(review): the credentials are hard-coded; consider reading
            # them from the Scrapy settings instead.
            self.db = pymysql.connect(
                host="localhost",
                user="root",
                password="1234",
                port=3306,
                db="lianjia",
                charset="utf8",
            )
            self.cursor = self.db.cursor()

        def process_item(self, item, spider):
            """Insert ``item`` into the ``xinfang`` table and pass it on.

            Args:
                item: mapping providing every key listed in ``COLUMNS``.
                spider: the running spider; its logger reports failed inserts.

            Returns:
                The unchanged ``item``, so later pipelines still receive it.
            """
            data = tuple(item[column] for column in self.COLUMNS)
            try:
                self.cursor.execute(self.INSERT_SQL, data)
                self.db.commit()
            except pymysql.MySQLError:
                # Keep the crawl going, but undo the failed statement and
                # leave a trace instead of dropping the error silently.
                self.db.rollback()
                spider.logger.exception("Failed to insert listing %r", item["name"])
            return item

        def close_spider(self, spider):
            """Release the cursor and the connection when the spider finishes."""
            self.cursor.close()
            self.db.close()

    6 反反爬措施

    由于是批量性爬取, 有必要采取些反反爬措施, 我这里采用的是免费的IP代理. 在middlewares.py中编辑如下代码:

    from scrapy import signals
    import logging
    import requests
    class ProxyMiddleware(object):
        """Downloader middleware that sends each request through a proxy.

        The proxy address is fetched from an HTTP API whose URL comes from the
        ``RANDOM_PROXY`` setting.
        """

        # Seconds to wait for the proxy API; without a limit a stalled API
        # would block every outgoing request.
        PROXY_API_TIMEOUT = 5

        def __init__(self, proxy):
            """Store the URL of the proxy API.

            Args:
                proxy: URL that is requested to obtain a proxy address.
            """
            self.logger = logging.getLogger(__name__)
            self.proxy = proxy

        @classmethod
        def from_crawler(cls, crawler):
            """Build the middleware from the crawler's ``RANDOM_PROXY`` setting."""
            settings = crawler.settings
            return cls(proxy=settings.get('RANDOM_PROXY'))

        def get_random_proxy(self):
            """Ask the proxy API for a proxy address.

            Returns:
                The stripped response body (used by ``process_request`` as the
                part after ``http://``), or ``None`` when the request fails,
                the status is not 200, or the body is empty.
            """
            try:
                response = requests.get(self.proxy, timeout=self.PROXY_API_TIMEOUT)
            except requests.RequestException as exc:
                self.logger.warning("Proxy API request failed: %s", exc)
                return None
            if response.status_code != 200:
                return None
            # Drop surrounding whitespace/newlines so the proxy URL stays valid.
            return response.text.strip() or None

        def process_request(self, request, spider):
            """Attach a freshly fetched proxy to ``request`` when one is available.

            The request is left untouched if no proxy could be obtained.
            """
            proxy = self.get_random_proxy()
            if proxy:
                url = 'http://' + str(proxy)
                self.logger.debug('本次使用代理' + proxy)
                request.meta['proxy'] = url

    7  配置settings文件

    import random
    # URL of the API that returns a proxy address (read by ProxyMiddleware).
    RANDOM_PROXY = "http://localhost:6686/random"

    BOT_NAME = 'lianjia'
    SPIDER_MODULES = ['lianjia.spiders']
    NEWSPIDER_MODULE = 'lianjia.spiders'

    ROBOTSTXT_OBEY = False

    # A fixed base delay (seconds) instead of one random value drawn at import
    # time, which could come out close to zero for a whole run. Scrapy itself
    # varies the wait per request when RANDOMIZE_DOWNLOAD_DELAY is enabled.
    DOWNLOAD_DELAY = 1
    RANDOMIZE_DOWNLOAD_DELAY = True

    COOKIES_ENABLED = False

    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
    }

    DOWNLOADER_MIDDLEWARES = {
       'lianjia.middlewares.ProxyMiddleware': 543,
    }

    ITEM_PIPELINES = {
       'lianjia.pipelines.LianjiaPipeline': 300,
    }

    8 执行项目文件

    在main.py中编写如下代码并运行该文件

    from scrapy import cmdline
    # Start the "xinfang" spider with the same argument list the command
    # `scrapy crawl xinfang` would produce.
    cmdline.execute(['scrapy', 'crawl', 'xinfang'])

    scrapy项目即可开始执行, 最后爬取到1万4千多条数据.

关键字