add:wos来源使用starterapi采集

main
zhaoxiangpeng 2 months ago
parent 78a76ba9f2
commit e3a23ad33e

@ -6,7 +6,7 @@ from copy import deepcopy
import scrapy
from scrapy.http.response.json import JsonResponse
from science_article_add.items.wos import WosCitedNumberItem
from science_article_add.items.wos import WosCitedNumberItem, WosIdRelationItem
from science_article_add.models import wos_model as model
from science_article_add.configs import wos as config
from science_article_add.utils import tools
@ -22,12 +22,15 @@ class WosLatestIncrementSpider(scrapy.Spider):
# start_urls = ["https://wos-api.clarivate.com/api/woslite"]
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_add.middlewares.WosLiteApiXkeyDownloaderMiddleware": 500
"science_article_add.middlewares.wos.WosStarterApiXkeyDownloaderMiddleware": 500
},
ITEM_PIPELINES={
"science_article_add.pipelines.ScienceAddBufferPipeline": 300,
}
"science_article_add.pipelines.mongo_pipeline.MongoPipeline": 300,
"science_article_add.pipelines.duptodo.DupTodoPipeline": 400,
},
LOG_LEVEL="INFO"
)
source = "wos"
def __init__(self, task_obj):
scrapy.Spider.__init__(self)
@ -44,9 +47,15 @@ class WosLatestIncrementSpider(scrapy.Spider):
async def start(self):
full_query = self.query_content
if self.query_condition is not None:
full_query = '%(query)s %(condition)s' % {'query': self.query_content, 'condition': self.query_condition}
full_query = '%(query)s%(condition)s' % {
'query': f'({self.query_content})' if self.query_condition else self.query_content,
'condition': ' ' + self.query_condition if self.query_condition else ''
}
self.logger.info(f'full_query: {full_query}')
meta = dict(q=full_query, page=self.first_page, limit=50, detail="short")
yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta)),
params = model.starter_documents_get(**meta)
enc_params = urlencode(params, doseq=True)
yield scrapy.Request(url=config.WOS_STARTER_DOCUMENT_API + '?' + enc_params,
meta=meta)
async def parse(self, response: JsonResponse, **kwargs):
@ -55,7 +64,6 @@ class WosLatestIncrementSpider(scrapy.Spider):
task_query_id = self.query_id
task_org_id = self.org_id
task_record_id = self.record_id
self.logger.debug('%s: %s' % ('parse_query_api', meta))
if response.status != 200:
self.logger.warning("""
@ -73,19 +81,35 @@ class WosLatestIncrementSpider(scrapy.Spider):
self.logger.info("""
检索式: %s
检索到结果: %s""" % (req_meta.get("q"), records_found))
self.set_records_found(records_found)
max_page = req_meta["MAX_PAGE"] = math.ceil(records_found / config.WOS_STARTER_PER_PAGE_LIMIT)
batch_time = datetime.now()
hits: list = resp_result.get("hits")
for record in hits:
cited_num = tools.get_list_key(array=record.get("citations"), target="count", condition=("db", "WOS"))
if cited_num:
cited_item = WosCitedNumberItem(third_id=record.get("uid"), cited=cited_num, updated_at=batch_time)
cited_item = WosCitedNumberItem()
cited_item['third_id'] = record.get("uid")
cited_item['cited'] = cited_num
cited_item['updated_at'] = batch_time
yield cited_item
yield WosIdRelationItem(third_id=record.get("uid"), query_ids=[task_query_id], updated_at=batch_time)
relation_item = WosIdRelationItem()
relation_item['third_id'] = record.get("uid")
relation_item['query_ids'] = [task_query_id]
relation_item['school_ids'] = [task_org_id]
relation_item['task_ids'] = [task_record_id]
relation_item['updated_at'] = batch_time
yield relation_item
if current_page < max_page:
meta_copy: dict = deepcopy(req_meta)
meta_copy.update({'page': meta_copy['page'] + 1})
yield scrapy.Request(config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
meta=meta_copy,
task_query_id=request.task_query_id)
yield scrapy.Request(
config.WOS_STARTER_DOCUMENT_API + '?' + urlencode(model.starter_documents_get(**meta_copy)),
meta=meta_copy)
def set_records_found(self, val: int) -> None:
    """Store the total hit count reported by the WOS Starter API search.

    Set from ``recordsFound`` in ``parse`` (see the ``set_records_found``
    call after the hit-count log); read back via ``get_records_found``.
    """
    self._records_found = val
def get_records_found(self) -> int:
    """Return the total hit count previously saved by ``set_records_found``.

    NOTE(review): raises AttributeError if called before ``set_records_found``
    has run — confirm callers only use this after ``parse`` has seen a
    successful response.
    """
    return self._records_found

Loading…
Cancel
Save