From e3a23ad33e91a509978a954f8d769f491fecb2a5 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Thu, 6 Nov 2025 16:06:19 +0800
Subject: [PATCH] add: collect the WOS source via the Starter API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../spiders/wos_latest_increment.py           | 48 ++++++++++++++-----
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/science_article_add/science_article_add/spiders/wos_latest_increment.py b/science_article_add/science_article_add/spiders/wos_latest_increment.py
index e058691..8808361 100644
--- a/science_article_add/science_article_add/spiders/wos_latest_increment.py
+++ b/science_article_add/science_article_add/spiders/wos_latest_increment.py
@@ -6,7 +6,7 @@ from copy import deepcopy
 import scrapy
 from scrapy.http.response.json import JsonResponse
 
-from science_article_add.items.wos import WosCitedNumberItem
+from science_article_add.items.wos import WosCitedNumberItem, WosIdRelationItem
 from science_article_add.models import wos_model as model
 from science_article_add.configs import wos as config
 from science_article_add.utils import tools
@@ -22,12 +22,15 @@ class WosLatestIncrementSpider(scrapy.Spider):
     # start_urls = ["https://wos-api.clarivate.com/api/woslite"]
     custom_settings = dict(
         DOWNLOADER_MIDDLEWARES={
-            "science_article_add.middlewares.WosLiteApiXkeyDownloaderMiddleware": 500
+            "science_article_add.middlewares.wos.WosStarterApiXkeyDownloaderMiddleware": 500
         },
         ITEM_PIPELINES={
-            "science_article_add.pipelines.ScienceAddBufferPipeline": 300,
-        }
+            "science_article_add.pipelines.mongo_pipeline.MongoPipeline": 300,
+            "science_article_add.pipelines.duptodo.DupTodoPipeline": 400,
+        },
+        LOG_LEVEL="INFO"
     )
+    source = "wos"
 
     def __init__(self, task_obj):
         scrapy.Spider.__init__(self)
@@ -44,9 +47,15 @@ class WosLatestIncrementSpider(scrapy.Spider):
     async def start(self):
         full_query = self.query_content
         if self.query_condition is not None:
-            full_query = '%(query)s %(condition)s' % {'query': self.query_content, 'condition': self.query_condition}
+            full_query = '%(query)s%(condition)s' % {
+                'query': f'({self.query_content})' if self.query_condition else self.query_content,
+                'condition': ' ' + self.query_condition if self.query_condition else ''
+            }
+        self.logger.info(f'full_query: {full_query}')
         meta = dict(q=full_query, page=self.first_page, limit=50, detail="short")
-        yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta)),
+        params = model.starter_documents_get(**meta)
+        enc_params = urlencode(params, doseq=True)
+        yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + enc_params,
                              meta=meta)
 
     async def parse(self, response: JsonResponse, **kwargs):
@@ -55,7 +64,6 @@ class WosLatestIncrementSpider(scrapy.Spider):
         task_query_id = self.query_id
         task_org_id = self.org_id
         task_record_id = self.record_id
-        self.logger.debug('%s: %s' % ('parse_query_api', meta))
 
         if response.status != 200:
             self.logger.warning("""
@@ -73,19 +81,35 @@ class WosLatestIncrementSpider(scrapy.Spider):
         self.logger.info("""
 检索式: %s
 检索到结果: %s""" % (req_meta.get("q"), records_found))
+        self.set_records_found(records_found)
         max_page = req_meta["MAX_PAGE"] = math.ceil(records_found / config.WOS_STARTER_PER_PAGE_LIMIT)
         batch_time = datetime.now()
         hits: list = resp_result.get("hits")
         for record in hits:
             cited_num = tools.get_list_key(array=record.get("citations"), target="count", condition=("db", "WOS"))
             if cited_num:
-                cited_item = WosCitedNumberItem(third_id=record.get("uid"), cited=cited_num, updated_at=batch_time)
+                cited_item = WosCitedNumberItem()
+                cited_item['third_id'] = record.get("uid")
+                cited_item['cited'] = cited_num
+                cited_item['updated_at'] = batch_time
                 yield cited_item
-            yield WosIdRelationItem(third_id=record.get("uid"), query_ids=[task_query_id], updated_at=batch_time)
+            relation_item = WosIdRelationItem()
+            relation_item['third_id'] = record.get("uid")
+            relation_item['query_ids'] = [task_query_id]
+            relation_item['school_ids'] = [task_org_id]
+            relation_item['task_ids'] = [task_record_id]
+            relation_item['updated_at'] = batch_time
+            yield relation_item
 
         if current_page < max_page:
             meta_copy: dict = deepcopy(req_meta)
             meta_copy.update({'page': meta_copy['page'] + 1})
-            yield scrapy.Request(config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
-                                 meta=meta_copy,
-                                 task_query_id=request.task_query_id)
+            yield scrapy.Request(
+                config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
+                meta=meta_copy)
+
+    def set_records_found(self, val):
+        self._records_found = val
+
+    def get_records_found(self) -> int:
+        return self._records_found
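
For reference, a minimal standalone sketch of the request-URL construction that the
patched start() performs. The endpoint URL and the example query below are
hypothetical placeholders, and starter_documents_get is stubbed here because its
implementation is not part of this diff; urllib.parse.urlencode is the real stdlib call.

    # Standalone sketch (not part of the patch). Assumptions: the endpoint
    # constant's value and the example query string; starter_documents_get is
    # a stub for science_article_add.models.wos_model.starter_documents_get.
    from urllib.parse import urlencode

    WOS_STARTER_DOCUMENT_API = "https://api.clarivate.com/apis/wos-starter/v1/documents"  # assumed value

    def starter_documents_get(q: str, page: int, limit: int, detail: str) -> dict:
        # Stub: mirrors the keyword arguments the spider passes via **meta.
        return {"q": q, "page": page, "limit": limit, "detail": detail}

    params = starter_documents_get(q="TS=(machine learning)", page=1, limit=50, detail="short")
    # doseq=True encodes any list values as repeated key=value pairs rather
    # than their Python repr, which is why the patch passes it explicitly.
    enc_params = urlencode(params, doseq=True)
    print(WOS_STARTER_DOCUMENT_API + "?" + enc_params)
    # -> .../documents?q=TS%3D%28machine+learning%29&page=1&limit=50&detail=short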