diff --git a/science_article_cnki/science_article_cnki/spiders/cnki_article_crossdb.py b/science_article_cnki/science_article_cnki/spiders/cnki_article_crossdb.py index 97bda17..43f1bb7 100644 --- a/science_article_cnki/science_article_cnki/spiders/cnki_article_crossdb.py +++ b/science_article_cnki/science_article_cnki/spiders/cnki_article_crossdb.py @@ -78,6 +78,7 @@ class CnkiArticleCrossdbSpider(scrapy.Spider): # ---------------------------------------------- 提取列表文章的逻辑 ---------------------------------------------- tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr') for tr_node in tr_nodes: + check_v = tr_node.xpath('./td[@class="seq"]/input/@value').get() # 下载导出用的v article_title = tr_node.xpath('./td[@class="name"]/a//text()').getall() # 文章标题 article_title = article_title and ''.join(article_title) article_link = tr_node.xpath('./td[@class="name"]/a/@href').get() # 文章链接(有v值) @@ -87,7 +88,7 @@ class CnkiArticleCrossdbSpider(scrapy.Spider): cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get() # 被引量字符串 param = tools.url_parse(article_link) - v = param.get('v') + v = check_v ti_format = ti2format(article_title) ti_unique = ti2unique_type2(ti=ti_format, so=source_title) @@ -131,6 +132,7 @@ class CnkiArticleCrossdbSpider(scrapy.Spider): # ---------------------------------------------- 提取列表文章的逻辑 ---------------------------------------------- tr_nodes = response.xpath('//div[@id="gridTable"]//table[@class="result-table-list"]/tbody/tr') for tr_node in tr_nodes: + check_v = tr_node.xpath('./td[@class="seq"]/input/@value').get() # 下载导出用的v article_title = tr_node.xpath('./td[@class="name"]/a/text()').get() # 文章标题 article_link = tr_node.xpath('./td[@class="name"]/a/@href').get() # 文章链接(有v值) source_title = tr_node.xpath('./td[@class="source"]/*/a/text()').get() # 出版物名称(刊名) @@ -139,7 +141,7 @@ class CnkiArticleCrossdbSpider(scrapy.Spider): cited_str = tr_node.xpath('./td[@class="quote"]/span/a/text()').get() # 被引量字符串 param = tools.url_parse(article_link) - v = param.get('v') + v = check_v ti_format = ti2format(article_title) ti_unique = ti2unique_type2(ti=ti_format, so=source_title) if third_id: