Python Scrapy in Practice: Scraping the Jianshu Website

Published: 2019-04-22 22:10:29

     

    1: Create the project
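
    The post doesn't show the exact command, but since the code below imports from a package named jianshu_spider, the project would be created with something like:

    scrapy startproject jianshu_spider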

    2: Create the spider
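
    Again as a sketch: the spider written later is a CrawlSpider named js restricted to jianshu.com, so it can be generated from the crawl template:

    scrapy genspider -t crawl js "jianshu.com"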

    3: Write a start.py file to run the spider

    # -*- coding:utf-8 -*-
    # Author:   baikai
    # Created:  2018/12/14 14:09
    # File:     start.py
    # IDE:      PyCharm
    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl js".split())

    4: Configure the relevant options in settings.py
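
    The post doesn't list the exact values. A typical configuration for this project would turn off robots.txt checking, send browser-like request headers, and enable the pipeline written later; only the pipeline path is taken from the code below, the other values are assumptions:

    # settings.py (sketch; values other than the pipeline path are assumptions)
    ROBOTSTXT_OBEY = False

    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }

    ITEM_PIPELINES = {
        'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
    }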

    Scraping the detail-page data

    Write the items.py file

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class ArticleItem(scrapy.Item):
        # The fields we want to store for each article
        title = scrapy.Field()
        content = scrapy.Field()
        article_id = scrapy.Field()
        origin_url = scrapy.Field()
        author = scrapy.Field()
        avatar = scrapy.Field()
        pub_time = scrapy.Field()

    Write js.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from jianshu_spider.items import ArticleItem
    
    class JsSpider(CrawlSpider):
        name = 'js'
        allowed_domains = ['jianshu.com']
        start_urls = ['https://www.jianshu.com/']
    
        rules = (
            # Match detail-page URLs such as https://www.jianshu.com/p/d8804d18d638
            Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
        )
    
        def parse_detail(self, response):
            # Parse the fields we need from the detail page
            title = response.xpath("//h1[@class='title']/text()").get()
            # Author avatar
            avatar = response.xpath("//a[@class='avatar']/img/@src").get()
            author = response.xpath("//span[@class='name']/a/text()").get()
            # Publish time
            pub_time = response.xpath("//span[@class='publish-time']/text()").get()
            # Article id, taken from the end of the detail-page URL,
            # e.g. https://www.jianshu.com/p/d8804d18d638 -> d8804d18d638
            url = response.url
            url1 = url.split("?")[0]
            article_id = url1.split("/")[-1]
            # Article body
            content = response.xpath("//div[@class='show-content']").get()

            item = ArticleItem(
                title=title,
                avatar=avatar,
                author=author,
                pub_time=pub_time,
                origin_url=response.url,
                article_id=article_id,
                content=content
            )
            yield item

     

    Design the database and table

    Database: jianshu

    Table: article

    The id column is set to auto-increment (a sketch of the table creation follows)
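
    The exact DDL isn't shown in the post. A minimal sketch that matches the fields in items.py and the pipeline below (the column types are assumptions; the credentials reuse the ones from the pipeline) is:

    # create_db.py (sketch)
    import pymysql

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           password='8Wxx.ypa', charset='utf8')
    cursor = conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS jianshu DEFAULT CHARACTER SET utf8")
    cursor.execute("USE jianshu")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS article (
            id INT AUTO_INCREMENT PRIMARY KEY,  -- auto-increment id, as described above
            title VARCHAR(255),
            content LONGTEXT,
            author VARCHAR(255),
            avatar VARCHAR(255),
            pub_time VARCHAR(64),
            origin_url VARCHAR(255),
            article_id VARCHAR(32)
        ) DEFAULT CHARACTER SET utf8
    """)
    conn.commit()
    conn.close()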

     

     Store the scraped data in the MySQL database

     

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import pymysql
    from twisted.enterprise import adbapi
    from pymysql import cursors
    
    class JianshuSpiderPipeline(object):
        def __init__(self):
            dbparams = {
                'host': '127.0.0.1',
                'port': 3306,
                'user': 'root',
                'password': '8Wxx.ypa',
                'database': 'jianshu',
                'charset': 'utf8'
            }
            # One connection and cursor are opened up front and reused for every item
            self.conn = pymysql.connect(**dbparams)
            self.cursor = self.conn.cursor()
            self._sql = None
    
        def process_item(self, item, spider):
            # Insert each scraped article into the article table
            self.cursor.execute(self.sql, (item['title'], item['content'], item['author'],
                                           item['avatar'], item['pub_time'],
                                           item['origin_url'], item['article_id']))
            self.conn.commit()
            return item
    
        @property
        def sql(self):
            # Build the insert statement once and cache it
            if not self._sql:
                self._sql = """
                    insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
                    """
            return self._sql
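
    One thing the pipeline above leaves out is closing the connection when the crawl ends. A small addition (not shown in the original) would be another method on JianshuSpiderPipeline:

        def close_spider(self, spider):
            # Close the cursor and connection when the spider finishes
            self.cursor.close()
            self.conn.close()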

    Run start.py to start the crawl; the scraped articles are written to the article table in MySQL.

     
