diff --git a/science_article_add/science_article_add/spiders/wos_latest_increment.py b/science_article_add/science_article_add/spiders/wos_latest_increment.py
index 68422c5..8900c4c 100644
--- a/science_article_add/science_article_add/spiders/wos_latest_increment.py
+++ b/science_article_add/science_article_add/spiders/wos_latest_increment.py
@@ -35,9 +35,9 @@ class WosLatestIncrementSpider(scrapy.Spider):
     def __init__(self, task_obj):
         scrapy.Spider.__init__(self)
         self.task_obj = task_obj
-        self.record_id = task_obj['id']
-        self.org_id = task_obj['org_id']
-        self.org_name = task_obj['org_name']
+        self.record_id = task_obj['task_id']
+        self.org_id = self.tolist(task_obj['org_id'])
+        self.org_name = self.tolist(task_obj['org_name'])
         self.query_id = task_obj['query_id']
         self.query_content = task_obj['content']
         self.query_condition = task_obj['task_condition']
@@ -45,6 +45,19 @@ class WosLatestIncrementSpider(scrapy.Spider):
         self.first_page = task_obj.get('first_page', 1)
         self._records_found = 0
 
+    @staticmethod
+    def tolist(datas) -> list:
+        """Return the unique elements of ``datas`` as a list.
+
+        Only list, tuple and set inputs are accepted; any other type
+        raises TypeError. First-seen order is kept so the result is
+        reproducible across runs (set() order varies for strings).
+        """
+        if isinstance(datas, (list, tuple, set)):
+            return list(dict.fromkeys(datas))
+        else:
+            raise TypeError("不支持的类型:%s" % (type(datas)))
+
     async def start(self):
         full_query = self.query_content
         if self.query_condition is not None:
@@ -62,9 +75,9 @@ class WosLatestIncrementSpider(scrapy.Spider):
     async def parse(self, response: JsonResponse, **kwargs):
         meta = response.meta
         request: scrapy.Request = response.request
-        task_query_id = self.query_id
-        task_org_id = self.org_id
-        task_record_id = self.record_id
+        task_query_id: int = self.query_id
+        task_org_id: list = self.org_id
+        task_record_id: int = self.record_id
 
         if response.status != 200:
             self.logger.warning("""
@@ -97,7 +110,7 @@ class WosLatestIncrementSpider(scrapy.Spider):
             relation_item = WosIdRelationItem()
             relation_item['third_id'] = record.get("uid")
             relation_item['query_ids'] = [task_query_id]
-            relation_item['school_ids'] = [task_org_id]
+            relation_item['school_ids'] = task_org_id
             relation_item['task_ids'] = [task_record_id]
             relation_item['updated_at'] = batch_time
             yield relation_item