Scrapy 通过 SQL 查询判重，实现基于数据库的增量爬取


原理：在 spider 爬虫文件中导入 pymysql，配置好数据库连接参数，然后通过 SQL 语句查询某个字段（下面的示例使用 novel_url）是否已存在于数据库中，从而实现去重和增量爬取的功能。
    def parseMainPage(self,response):
        # 链接数据库
        db = pymysql.connect(
            host='localhost',
            database='你的数据库名称',
            user='root',
            password='你的数据库密码',
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        #通过xpath解析url
        urls = response.xpath('//div[@class="book-mid-info"]/h4')
        for x in urls:
            item = NovelItem()
            novel_url = x.xpath('a/@href').extract()
            all_url = response.urljoin(novel_url[0])       
接着判断该链接是否已经存在于数据库中；只有不存在时，才将 item 存入数据库：
            with db.cursor() as cursor:
                # sql为你的查询语句
                sql = "SELECT novel_url FROM novel WHERE novel_url= %s"
                cursor.execute(sql, (all_url))
                result = cursor.fetchone()
                # 不在数据库中, 则插入
                if result == None:
                    item['novel_url'] = all_url  # 小说链接
                    for url in novel_url:
                        url = response.urljoin(url)