Principle:
In the spider file, import pymysql and define the database connection parameters. A SQL query then checks whether a given field value already exists in the database; only records that are not found get stored, which gives you deduplication for incremental crawling.
import pymysql

from ..items import NovelItem  # standard Scrapy project layout

def parseMainPage(self, response):
    # Connect to the database
    db = pymysql.connect(
        host='localhost',
        database='your_database_name',
        user='root',
        password='your_database_password',
        port=3306,
        charset='utf8',
        cursorclass=pymysql.cursors.DictCursor,
        use_unicode=True
    )
    # Extract the novel links with XPath
    urls = response.xpath('//div[@class="book-mid-info"]/h4')
    for x in urls:
        item = NovelItem()
        novel_url = x.xpath('a/@href').extract()
        all_url = response.urljoin(novel_url[0])
        # Check whether the URL is already in the database; store the item only if it is not
        with db.cursor() as cursor:
            # sql is your query statement
            sql = "SELECT novel_url FROM novel WHERE novel_url = %s"
            cursor.execute(sql, (all_url,))  # query parameters must be passed as a tuple
            result = cursor.fetchone()
            # Not in the database yet, so keep this item
            if result is None:
                item['novel_url'] = all_url  # the novel's link
                for url in novel_url:
                    # resolve each relative link to an absolute URL
                    url = response.urljoin(url)
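
The snippet above stops at the duplicate check and never shows the write that makes the next run skip this URL. Below is a minimal sketch of that missing step, assuming the same novel table and novel_url column referenced by the SELECT; the explicit commit is needed because pymysql does not autocommit by default.

# Sketch of the missing INSERT step (assumes the `novel` table and
# `novel_url` column used by the SELECT above)
if result is None:
    item['novel_url'] = all_url
    with db.cursor() as cursor:
        insert_sql = "INSERT INTO novel (novel_url) VALUES (%s)"
        cursor.execute(insert_sql, (all_url,))
    db.commit()  # pymysql disables autocommit by default, so commit explicitly

As a design alternative, declaring a UNIQUE index on novel_url and writing with INSERT IGNORE lets MySQL reject duplicates in a single statement, which avoids the SELECT-then-INSERT round trip on every link.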