|
|
|
|
@ -35,9 +35,9 @@ class WosLatestIncrementSpider(scrapy.Spider):
|
|
|
|
|
def __init__(self, task_obj):
|
|
|
|
|
scrapy.Spider.__init__(self)
|
|
|
|
|
self.task_obj = task_obj
|
|
|
|
|
self.record_id = task_obj['id']
|
|
|
|
|
self.org_id = task_obj['org_id']
|
|
|
|
|
self.org_name = task_obj['org_name']
|
|
|
|
|
self.record_id = task_obj['task_id']
|
|
|
|
|
self.org_id = self.tolist(task_obj['org_id'])
|
|
|
|
|
self.org_name = self.tolist(task_obj['org_name'])
|
|
|
|
|
self.query_id = task_obj['query_id']
|
|
|
|
|
self.query_content = task_obj['content']
|
|
|
|
|
self.query_condition = task_obj['task_condition']
|
|
|
|
|
@ -45,6 +45,13 @@ class WosLatestIncrementSpider(scrapy.Spider):
|
|
|
|
|
self.first_page = task_obj.get('first_page', 1)
|
|
|
|
|
self._records_found = 0
|
|
|
|
|
|
|
|
|
|
@staticmethod
def tolist(datas) -> list:
    """Return the distinct elements of *datas* as a list, in first-seen order.

    Args:
        datas: A ``list``, ``tuple`` or ``set`` of hashable values.

    Returns:
        A new list holding each distinct element once, ordered by first
        occurrence in *datas*. For a ``set`` input the order is whatever
        iterating that set yields.

    Raises:
        TypeError: If *datas* is not a ``list``, ``tuple`` or ``set``
            (a bare scalar such as a single id is rejected, not wrapped),
            or if an element is unhashable.
    """
    if not isinstance(datas, (list, tuple, set)):
        raise TypeError("不支持的类型:%s" % (type(datas)))
    # De-duplicate through dict keys rather than set(): a dict keeps
    # insertion order, so the result is deterministic. set() ordering of
    # strings changes between interpreter runs (hash randomisation).
    # NOTE(review): __init__ passes both org_id and org_name through this
    # helper; if those are meant to be parallel lists, a stable order is
    # needed to keep them aligned - confirm with the task producer.
    return list(dict.fromkeys(datas))
|
|
|
|
|
|
|
|
|
|
async def start(self):
|
|
|
|
|
full_query = self.query_content
|
|
|
|
|
if self.query_condition is not None:
|
|
|
|
|
@ -62,9 +69,9 @@ class WosLatestIncrementSpider(scrapy.Spider):
|
|
|
|
|
async def parse(self, response: JsonResponse, **kwargs):
|
|
|
|
|
meta = response.meta
|
|
|
|
|
request: scrapy.Request = response.request
|
|
|
|
|
task_query_id = self.query_id
|
|
|
|
|
task_org_id = self.org_id
|
|
|
|
|
task_record_id = self.record_id
|
|
|
|
|
task_query_id: int = self.query_id
|
|
|
|
|
task_org_id: list = self.org_id
|
|
|
|
|
task_record_id: int = self.record_id
|
|
|
|
|
|
|
|
|
|
if response.status != 200:
|
|
|
|
|
self.logger.warning("""
|
|
|
|
|
@ -97,7 +104,7 @@ class WosLatestIncrementSpider(scrapy.Spider):
|
|
|
|
|
relation_item = WosIdRelationItem()
|
|
|
|
|
relation_item['third_id'] = record.get("uid")
|
|
|
|
|
relation_item['query_ids'] = [task_query_id]
|
|
|
|
|
relation_item['school_ids'] = [task_org_id]
|
|
|
|
|
relation_item['school_ids'] = task_org_id
|
|
|
|
|
relation_item['task_ids'] = [task_record_id]
|
|
|
|
|
relation_item['updated_at'] = batch_time
|
|
|
|
|
yield relation_item
|
|
|
|
|
|