@@ -6,7 +6,7 @@ from copy import deepcopy
 
 import scrapy
 from scrapy.http.response.json import JsonResponse
-from science_article_add.items.wos import WosCitedNumberItem
+from science_article_add.items.wos import WosCitedNumberItem, WosIdRelationItem
 from science_article_add.models import wos_model as model
 from science_article_add.configs import wos as config
 from science_article_add.utils import tools
@@ -22,12 +22,15 @@ class WosLatestIncrementSpider(scrapy.Spider):
     # start_urls = ["https://wos-api.clarivate.com/api/woslite"]
     custom_settings = dict(
         DOWNLOADER_MIDDLEWARES={
-            "science_article_add.middlewares.WosLiteApiXkeyDownloaderMiddleware": 500
+            "science_article_add.middlewares.wos.WosStarterApiXkeyDownloaderMiddleware": 500
         },
         ITEM_PIPELINES={
-            "science_article_add.pipelines.ScienceAddBufferPipeline": 300,
-        }
+            "science_article_add.pipelines.mongo_pipeline.MongoPipeline": 300,
+            "science_article_add.pipelines.duptodo.DupTodoPipeline": 400,
+        },
+        LOG_LEVEL="INFO"
     )
+    source = "wos"
 
     def __init__(self, task_obj):
         scrapy.Spider.__init__(self)
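
Note: with the settings above, Scrapy runs item pipelines in ascending priority order, so MongoPipeline (300) receives each item before DupTodoPipeline (400), and an item only reaches the second pipeline if the first one returns it. A minimal sketch of a compatible pipeline, assuming nothing about the project's real classes beyond the standard Scrapy interface (the class name here is illustrative):

    class ExamplePipeline:
        """Illustrative stand-in for MongoPipeline/DupTodoPipeline."""

        def open_spider(self, spider):
            # Called once when the spider starts; a Mongo pipeline would
            # typically open its client connection here.
            spider.logger.info("pipeline opened for %s", spider.name)

        def process_item(self, item, spider):
            # Must return the item (or raise DropItem) so the next
            # pipeline in priority order can process it.
            return item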
@@ -44,9 +47,15 @@ class WosLatestIncrementSpider(scrapy.Spider):
 
     async def start(self):
-        full_query = self.query_content
-        if self.query_condition is not None:
-            full_query = '%(query)s %(condition)s' % {'query': self.query_content, 'condition': self.query_condition}
+        full_query = '%(query)s%(condition)s' % {
+            'query': f'({self.query_content})' if self.query_condition else self.query_content,
+            'condition': ' ' + self.query_condition if self.query_condition else ''
+        }
         self.logger.info(f'full_query: {full_query}')
         meta = dict(q=full_query, page=self.first_page, limit=50, detail="short")
-        yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta)),
+        params = model.starter_documents_get(**meta)
+        enc_params = urlencode(params, doseq=True)
+        yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + enc_params,
                              meta=meta)
 
     async def parse(self, response: JsonResponse, **kwargs):
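
Note: the rewritten start() folds both branches into one expression: the base query is parenthesized only when a condition is appended, which keeps boolean operators in the condition from binding into the middle of the base query, and doseq=True lets urlencode expand any list-valued parameter into repeated keys. A standalone sketch of both behaviors, independent of the project's model helpers (names and sample queries are illustrative):

    from typing import Optional
    from urllib.parse import urlencode

    def build_query(query_content: str, query_condition: Optional[str]) -> str:
        # Mirrors the new full_query construction in start().
        return '%(query)s%(condition)s' % {
            'query': f'({query_content})' if query_condition else query_content,
            'condition': ' ' + query_condition if query_condition else ''
        }

    print(build_query('TS=graphene', 'AND PY=2024'))  # (TS=graphene) AND PY=2024
    print(build_query('TS=graphene', None))           # TS=graphene

    # doseq=True turns sequence values into repeated query parameters.
    print(urlencode({'q': 'TS=graphene', 'ids': ['a', 'b']}, doseq=True))
    # q=TS%3Dgraphene&ids=a&ids=b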
@@ -55,7 +64,6 @@ class WosLatestIncrementSpider(scrapy.Spider):
         task_query_id = self.query_id
         task_org_id = self.org_id
         task_record_id = self.record_id
-        self.logger.debug('%s: %s' % ('parse_query_api', meta))
 
         if response.status != 200:
             self.logger.warning("""
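
Note: for the response.status != 200 branch to ever run, non-2xx responses must get past Scrapy's HttpError filter; by default they are dropped before parse() is called. This may already be arranged by the project's X-key middleware, but the standard spider-level switch looks like this (the class name and status codes are illustrative):

    import scrapy

    class ExampleSpider(scrapy.Spider):
        name = "example"
        # Allow these HTTP status codes through to the parse() callback
        # instead of letting HttpErrorMiddleware discard the response.
        handle_httpstatus_list = [401, 429, 500]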
@@ -73,19 +81,35 @@ class WosLatestIncrementSpider(scrapy.Spider):
         self.logger.info("""
         Search query: %s
         Results found: %s""" % (req_meta.get("q"), records_found))
+        self.set_records_found(records_found)
         max_page = req_meta["MAX_PAGE"] = math.ceil(records_found / config.WOS_STARTER_PER_PAGE_LIMIT)
         batch_time = datetime.now()
         hits: list = resp_result.get("hits")
         for record in hits:
             cited_num = tools.get_list_key(array=record.get("citations"), target="count", condition=("db", "WOS"))
             if cited_num:
-                cited_item = WosCitedNumberItem(third_id=record.get("uid"), cited=cited_num, updated_at=batch_time)
+                cited_item = WosCitedNumberItem()
+                cited_item['third_id'] = record.get("uid")
+                cited_item['cited'] = cited_num
+                cited_item['updated_at'] = batch_time
                 yield cited_item
-            yield WosIdRelationItem(third_id=record.get("uid"), query_ids=[task_query_id], updated_at=batch_time)
+            relation_item = WosIdRelationItem()
+            relation_item['third_id'] = record.get("uid")
+            relation_item['query_ids'] = [task_query_id]
+            relation_item['school_ids'] = [task_org_id]
+            relation_item['task_ids'] = [task_record_id]
+            relation_item['updated_at'] = batch_time
+            yield relation_item
 
         if current_page < max_page:
             meta_copy: dict = deepcopy(req_meta)
             meta_copy.update({'page': meta_copy['page'] + 1})
-            yield scrapy.Request(config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
-                                 meta=meta_copy,
-                                 task_query_id=request.task_query_id)
+            yield scrapy.Request(
+                config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
+                meta=meta_copy)
+
+    def set_records_found(self, val):
+        self._records_found = val
+
+    def get_records_found(self) -> int:
+        return self._records_found
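
Note: the field-by-field assignment style above requires every key to be declared on the item class, since scrapy.Item raises a KeyError for undeclared fields; the new school_ids and task_ids assignments therefore imply matching Field declarations. A hedged sketch of what science_article_add/items/wos.py presumably declares (the actual classes may carry more fields or serializer metadata):

    import scrapy

    class WosCitedNumberItem(scrapy.Item):
        third_id = scrapy.Field()
        cited = scrapy.Field()
        updated_at = scrapy.Field()

    class WosIdRelationItem(scrapy.Item):
        third_id = scrapy.Field()
        query_ids = scrapy.Field()
        school_ids = scrapy.Field()
        task_ids = scrapy.Field()
        updated_at = scrapy.Field()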