add: wos liteapi incremental data

main
zhaoxiangpeng 3 weeks ago
parent b14a503758
commit f977b8ad51

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="pydevenv" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

@ -0,0 +1,12 @@
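# Entry point: fetch one batch task from MySQL via TaskManager and launch the
# wos_latest_increment spider with it, using the project settings.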
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.scripts.get_db_task import TaskManager
tm = TaskManager()
process = CrawlerProcess(get_project_settings())
task = tm.get_task_from_mysql()
process.crawl('wos_latest_increment', task_obj=task)
process.start()

@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
# @Time : 2024/1/16 8:41
# @Author : zhaoxiangpeng
# @File : config.py
from datetime import datetime
# Name of the data source
SOURCE_NAME = 'wos'
WOS_SEARCH_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch"
WOS_DETAIL_LINK = 'https://webofscience.clarivate.cn/wos/woscc/full-record/{wos_id}'
WOS_DETAIL_API = 'https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch'
WOS_ADVANCED_SEARCH_API = 'https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch'
WOS_EXPORT_FILE_API = 'https://webofscience.clarivate.cn/api/wosnx/indic/export/saveToFile'
WOS_RECORD_STREAM_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQueryGetRecordsStream"
WOS_REFINE_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQueryRefine"
# WOS starter api
WOS_STARTER_DOCUMENT_UID_API = "https://api.clarivate.com/apis/wos-starter/v1/documents/{uid}" # Unique Identifier/Accession Number
WOS_STARTER_DOCUMENT_API = "https://api.clarivate.com/apis/wos-starter/v1/documents"
WOS_STARTER_PER_PAGE_LIMIT = 50  # maximum number of records per page
# WOS lite api
WOS_LITE_QUERY_FIRST_API = 'https://wos-api.clarivate.com/api/woslite'  # the first request returns a QueryID
WOS_LITE_QUERY_API = 'https://wos-api.clarivate.com/api/woslite/query'  # page through the results using that QueryID
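# A sketch of the two-step Lite flow (illustrative values; see the spider in this commit):
#   1) GET WOS_LITE_QUERY_FIRST_API?databaseId=WOS&usrQuery=...&firstRecord=1&count=100
#      -> the response carries QueryResult.QueryID and QueryResult.RecordsFound
#   2) GET WOS_LITE_QUERY_API/<QueryID>?firstRecord=101&count=100 for each following page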
# Publication (article) collection
WOS_ARTICLE_COLLECTION = 'data_{}_article'.format(SOURCE_NAME)
# Citation-count collection
WOS_CITED_NUMBER_COLLECTION = "relation_cited_number_{}".format(SOURCE_NAME)
# Institution-publication relation collection
SCHOOL_RELATION_COLLECTION = 'relation_school_{}'.format(SOURCE_NAME)
# Cited-reference collection
WOS_REFERENCE_COLLECTION = "relation_reference_{}".format(SOURCE_NAME)
# Collection of IDs waiting to be downloaded
ARTICLE_TODO_IDS_COLLECTION = "todo_ids_{}".format(SOURCE_NAME)
# Publication collection for the CSCD source
WOS_CSCD_ARTICLE_COLLECTION = 'data_{}_article_{}'.format(SOURCE_NAME, 'cscd')
# Cookie pool configuration
# COOKIE_POOL_CONFIG = dict(host=setting.REDIS_HOST, port=6379, db=setting.REDIS_DB, password=setting.REDIS_PASSWORD)
COOKIE_POOL_GROUP = 'cookies_pool:wos:sid*'
COOKIE_POOL_KEY = 'cookies_pool:wos:sid-sjtu'
COOKIE_TTL = 60 * 60 * 4
# Number of records per downloaded file
BATCH_DOWNLOAD_LIMIT = 500
# Default record filter when exporting files
DEFAULT_EXPORT_RECORD_FILTER = "fullRecordPlus"  # fullRecordPlus
# Table-header validation markers
SUCCESS_TABLE_HEAD_START = b'\xef\xbb\xbfPT'
LOST_TABLE_HEAD_START = b'\xef\xbb\xbfnull'
AUTO_TABLE_HEAD_START = b'\xef\xbb\xbfPT\tAU\tBA\tBE\tGP\tAF\tBF\tCA\tTI\tSO\tSE\tBS\tLA\tDT\tCT\tCY\tCL\tSP\tHO\tDE\tID\tAB\tC1\tC3\tRP\tEM\tRI\tOI\tFU\tFP\tFX\tCR\tNR\tTC\tZ9\tU1\tU2\tPU\tPI\tPA\tSN\tEI\tBN\tJ9\tJI\tPD\tPY\tVL\tIS\tPN\tSU\tSI\tMA\tBP\tEP\tAR\tDI\tDL\tD2\tEA\tPG\tWC\tWE\tSC\tGA\tPM\tOA\tHC\tHP\tDA\tUT\r\n'
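# Hypothetical helper (not part of the original commit): one way the markers above
# could be used to validate the head of a downloaded export file.
def is_valid_export_head(raw: bytes) -> bool:
    """Return True when a downloaded export starts with a normal WOS header."""
    return raw.startswith(SUCCESS_TABLE_HEAD_START) and not raw.startswith(LOST_TABLE_HEAD_START)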
CORE_NAME_TABLE = dict(
WOSCC="Web of Science Core Collection",
BCI="BIOSIS Citation Index",
SCIELO="SciELO Citation Index",
RSCI="Russian Science Citation Index",
CSCD="Chinese Science Citation Database℠",
ARCI="Arabic Citation Index",
DIIDW="Derwent Innovations Index",
PPRN="",
PQDT="ProQuest ™ Dissertations & Theses Citation Index"
)
NAV_NAME_TABLE = dict(
SCI="Science Citation Index Expanded (SCI-Expanded)",
ESCI="Emerging Sources Citation Index (ESCI)",
SSCI="Social Sciences Citation Index (SSCI)",
ISTP="Conference Proceedings Citation Index Science (CPCI-S)",
BSCI="Book Citation Index Science (BKCI-S)",
AHCI="Arts & Humanities Citation Index (A&HCI)",
IC="Index Chemicus (IC)",
ISSHP="Conference Proceedings Citation Index Social Sciences & Humanities (CPCI-SSH)"
)
TASK_CONFIG = {
"school_id": 83,
"school_name": "北京林业大学",
"search_policy": """OG=(Beijing Forestry University)""",
"crawl_year": [2021, 2022, 2023],
"source_type": 1,
"priority": 10,
"is_important": 1,
"update_interval": 60 * 60 * 24 * 14,
"create_time": datetime.now(),
"last_time": datetime.now(),
"next_time": datetime.now(),
"state": 0
}

@ -0,0 +1,20 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScienceArticleAddItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
third_id = scrapy.Field()
updated_at = scrapy.Field()
class WosLiteAddItem(ScienceArticleAddItem):
year = scrapy.Field()
query_ids = scrapy.Field()
school_ids = scrapy.Field()
task_ids = scrapy.Field()

@ -0,0 +1,113 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScienceArticleAddSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
# matching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScienceArticleAddDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class WosLiteApiXkeyDownloaderMiddleware:
    """Attach the WOS Lite API key header to every outgoing request."""

    async def process_request(self, request, spider):
        key_param = {
            'X-ApiKey': '941a216f25cbef0f80ee4ba58a08ef1e19dee7a4'
        }
        # request.headers is always a Headers object, so update it in place and
        # return None so the request continues through the downloader
        # (returning the request object here would re-schedule it).
        request.headers.update(key_param)
        return None

@ -0,0 +1,318 @@
# -*- coding: utf-8 -*-
# @Time : 2023/7/13 9:40
# @Author : zhaoxiangpeng
# @File : model.py
import json
import enum
import warnings
from typing import List, Tuple, Any, Dict, Union
from urllib.parse import urlencode
from science_article_add.configs import wos as config
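# Aliases so the JSON-style payloads below can use true/false/null literally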
false = False
true = True
null = None
class WosDB(enum.Enum):
WOS = 1
CSCD = 2
class AnalyzesEnum(enum.Enum):
WOSCC = ["TP.Value.6", "REVIEW.Value.6", "EARLY ACCESS.Value.6", "OA.Value.6", "DR.Value.6", "ECR.Value.6",
"PY.Field_D.6", "DT.Value.6", "AU.Value.6", "DX2NG.Value.6", "PEERREVIEW.Value.6"]
CSCD = ["TP.Value.6", "DR.Value.6", "OA.Value.6", "PY.Field_D.6", "DT.Value.6", "SJ.Value.6", "AU.Value.6",
"OG.Value.6", "SO.Value.6"]
ColNameMap = dict(WOS='WOSCC', CSCD='CSCD')
def calculate_next_page(next_page: int = 1, page_size: int = 100):
"""
计算下一页的游标即记录的序号
:param next_page: 下一页的页码
:param page_size: 每页的大小
:return:
"""
return (next_page - 1) * page_size + 1
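# Worked example with page_size=100: page 1 -> cursor 1, page 2 -> 101, page 5 -> 401.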
def lite_base_model(usr_query: str, db_id: int = None, first_record: int = 1, page_size: int = 100, **kwargs):
if db_id is None:
db_id = 1
if first_record > 1e5:
        warnings.warn('first_record must be between 1 and 100000')
model = {
'databaseId': WosDB(db_id).name,
'firstRecord': first_record,
'count': page_size,
'usrQuery': usr_query
}
# return urlencode(model)
return model
def lite_query_model(db_id: int = None, first_record: int = 1, page_size: int = 100, **kwargs):
if db_id is None:
db_id = 1
model = {
'databaseId': WosDB(db_id).name,
'firstRecord': first_record,
'count': page_size,
}
return urlencode(model)
def starter_documents_uid_get(uid, detail: str = None):
    """
    Build the query parameters for the starter /documents/{uid} endpoint.
    :param uid:
    :param detail:
    :return:
    """
    _query_params: List[Tuple[str, str]] = []
    if detail is not None:
        _query_params.append(("detail", detail))
    return _query_params
def starter_documents_get(q, db: str = WosDB.WOS.name, limit: int = config.WOS_STARTER_PER_PAGE_LIMIT, page: int = 1, sort_field: str = None,
                          modified_time_span=None, tc_modified_time_span=None, detail=None, **kwargs):
    """
    :param q:
    :param db:
    :param limit: maximum of 50
    :param page: with limit=50 the page range is 1~2000, i.e. at most 100k records
    :param sort_field:
    :param modified_time_span:
    :param tc_modified_time_span:
    :param detail: full record data by default; if "short", only a reduced field set is returned
        (uid, links{record,citingArticles,references,related}, citations[{db,count}], identifiers{doi,issn})
    :param kwargs:
    :return:
    """
    _query_params: List[Tuple[str, str]] = []
    _query_params.append(("q", q))
    if db:
        _query_params.append(("db", db))
    _query_params.append(("limit", limit))
    _query_params.append(("page", page))
    if detail is not None:
        _query_params.append(("detail", detail))
    return _query_params
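# Example: starter_documents_get(q="OG=(Beijing Forestry University)", page=2) returns
# [("q", "OG=(Beijing Forestry University)"), ("db", "WOS"), ("limit", 50), ("page", 2)]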
def make_advanced_search_ut(query: str = None, wos_ids: List = None, limit: int = 50,
                            col_name: str = "WOS") -> Dict[str, Any]:
    if query is None:
        if wos_ids is None:
            raise ValueError('at least one of query and wos_ids must not be None')
        query = ' OR '.join([f'UT=({wos_id})' for wos_id in wos_ids])
    # Resolve the custom short name to the core collection identifier
    product = ColNameMap[col_name]
model = {
"product": product,
"searchMode": "general",
"viewType": "search",
"serviceMode": "summary",
"search": {
"mode": "general",
"database": product,
"query": [
{
"rowText": query
}
],
"sets": [],
"options": {
"lemmatize": "On"
}
},
"retrieve": {
"count": limit,
"history": True,
"jcr": True,
"sort": "relevance",
"analyzes": getattr(AnalyzesEnum, product).value
},
"eventMode": None,
"isPreprintReview": False
}
return model
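# Example (hypothetical UTs): make_advanced_search_ut(wos_ids=["WOS:000000000000001", "WOS:000000000000002"])
# produces a WOSCC payload whose query row is
#   "UT=(WOS:000000000000001) OR UT=(WOS:000000000000002)"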
def export_search_data_to_txt(
q_id: str,
mark_from: int = 1,
mark_to: int = 500,
col_name: str = "WOS",
filters: str = config.DEFAULT_EXPORT_RECORD_FILTER
) -> Dict[str, Any]:
"""
导出搜索到的记录
:param q_id: 通过检索得到的检索结果id
:param mark_from: 记录开始包含
:param mark_to: 记录结束包含
:param col_name: 来源库/核心
:param filters: fullRecord(完整记录)/fullRecordPlus(完整记录和参考文献)
:return:
"""
if mark_to - mark_from > 500:
mark_to = mark_from + 499
model = {"parentQid": q_id, "sortBy": "relevance",
"displayTimesCited": "true", "displayCitedRefs": "true", "product": "UA", "colName": col_name,
"displayUsageInfo": "true", "fileOpt": "othersoftware", "action": "saveToTab",
"markFrom": str(mark_from), "markTo": str(mark_to),
"view": "summary", "isRefQuery": "false", "locale": "zh_CN", "filters": filters}
return model
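# Note: export batches are capped at 500 records; e.g. mark_from=1, mark_to=1000 is
# clamped to mark_to=500, which matches BATCH_DOWNLOAD_LIMIT in the wos config.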
def article_detail_model(uts: Union[List[str], str], core: str = "WOSCC"):
"""
详情 https://webofscience.clarivate.cn/wos/woscc/full-record/{wos_id}
接口 https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch
:param uts:
:param core:
:return:
"""
if isinstance(uts, str):
uts = [uts]
model = {
"eventMode": null,
"isPreprintReview": false,
"product": core,
"retrieve": {
"first": 1, "links": "retrieve", "sort": "relevance", "count": 1, "view": "super",
"coll": null, "activity": false, "analyzes": null, "jcr": true, "reviews": true,
"highlight": null,
"secondaryRetrieve": {
"associated_data": {
"sort": "relevance", "count": 10
},
"cited_references": {
"sort": "author-ascending", "count": 30
},
"citing_article": {
"sort": "date", "count": 2, "links": null, "view": "mini"
},
"cited_references_with_context": {
"sort": "date", "count": 135, "view": "mini"
},
"recommendation_articles": {
"sort": "recommendation-relevance", "count": 5, "links": null, "view": "mini"
},
"grants_to_wos_records": {
"sort": "date-descending", "count": 30, "links": null, "view": "mini"
}
}
},
"search": {
"database": core,
"mode": "record_ids",
"uts": uts
},
"searchMode": "record_ids",
"viewType": "search",
"serviceMode": "summary",
}
return model
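# Example (hypothetical UT): article_detail_model("WOS:000000000000001") builds a
# record_ids search that retrieves a single record with the "super" view.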
# Model dedicated to cited-by (citing article) queries
def get_wos_core_cites(
uts_or_qid: str,
year_range: tuple = None,
core: str = "WOSCC",
parent_db: str = "WOSCC",
is_refine: bool = False
):
"""
https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch
:param uts_or_qid:
:param year_range: 筛选的年份范围
:param core: 检索的数据库
:param parent_db:
:param is_refine: 是否是精炼检索
:return:
"""
model = {
"eventMode": null,
"isPreprintReview": false,
"product": core,
"search": {"database": core, "mode": "citing_article", "parentDatabase": parent_db,
"parentDoc": null,
"parentId": {"type": "colluid", "value": uts_or_qid},
"parentQid": null, "parentSort": null},
# "retrieve": {
# "sort": "date-descending",
# "count": 50,
# "jcr": true,
# "history": true,
# "analyzes": ["TP.Value.6", "REVIEW.Value.6", "EARLY ACCESS.Value.6", "OA.Value.6",
# "DR.Value.6", "ECR.Value.6", "PY.Field_D.6", "DT.Value.6", "AU.Value.6",
# "DX2NG.Value.6", "PEERREVIEW.Value.6"]
# },
"searchMode": "citing_article",
"serviceMode": "summary",
"viewType": "search",
}
refines = []
if year_range:
is_refine = True
years = list(range(*year_range)) + [year_range[-1]]
refines.append(dict(
index="PY", value=[str(year) for year in years]
))
len(refines) and model.update({"refines": refines})
if is_refine:
model.setdefault("qid", uts_or_qid)
model.pop("search")
model.pop("isPreprintReview")
model.update(viewType="refine")
return model
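# Example: year_range=(2021, 2023) adds a PY refine with values ["2021", "2022", "2023"]
# (range(*year_range) plus the end year, so the upper bound is inclusive).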
def get_aggregation_wos_cited(q_id: str, core: str = "WOSCC"):
"""
获取各核心引用的聚合
https://webofscience.clarivate.cn/api/wosnx/core/runQueryGetRecordsStream
"""
model = {
"product": core,
"qid": q_id,
"retrieve": {
"analyzes": ["EDN.Value.200"]
},
"searchMode": "citing_article",
"viewType": "records"
}
return model
def get_refine_count(q_id: str, count: int = 5):
model = {
"eventMode": null,
"product": "WOSCC",
"qid": q_id,
"refines": [
{"index": "EDN", "value": ["WOS.SCI", "WOS.SSCI", "WOS.AHCI"]}
],
# "retrieve": {
# "count": count, "sort": "date-descending", "history": true, "jcr": true,
# "analyzes": ["TP.Value.6", "REVIEW.Value.6", "EARLY ACCESS.Value.6", "OA.Value.6",
# "DR.Value.6", "ECR.Value.6", "PY.Field_D.6", "DT.Value.6", "AU.Value.6",
# "DX2NG.Value.6", "PEERREVIEW.Value.6"]
# },
"searchMode": "citing_article",
"serviceMode": "summary",
"viewType": "refine",
}
return model
if __name__ == '__main__':
    # Quick smoke test using the search policy from config.TASK_CONFIG
    m1 = lite_base_model(usr_query='OG=(Beijing Forestry University)', db_id=WosDB.WOS.value)
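    # With these defaults the payload should be:
    # {'databaseId': 'WOS', 'firstRecord': 1, 'count': 100, 'usrQuery': 'OG=(Beijing Forestry University)'}
    print(m1)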

@ -0,0 +1,59 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo
class ScienceAddBufferPipeline:
def __init__(self, buffer_max_size: int = 100):
self.buffer = []
self.buffer_size = 0
self.buffer_max_size = buffer_max_size
@classmethod
def from_crawler(cls, crawler):
return cls(
buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100),
)
    def process_item(self, item, spider):
        self.buffer.append(item)
        self.buffer_size = len(self.buffer)
        return item
def close_spider(self, spider):
self.buffer.clear()
class ScienceArticleAddPipeline(ScienceAddBufferPipeline):
def __init__(self, mongo_uri, mongo_db, buffer_max_size):
super().__init__(buffer_max_size=buffer_max_size)
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
self.client = None
self.db = None
@classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100),
)
def open_spider(self, spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
    def process_item(self, item, spider):
        super().process_item(item, spider)
        if self.buffer_size >= self.buffer_max_size:
            # The buffer is only cleared here; writing it to MongoDB is not
            # implemented in this commit (see the sketch after this class).
            self.buffer.clear()
            self.buffer_size = 0
        return item
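# A possible flush step (an assumption, not part of this commit): before clearing the
# buffer, the items could be written to MongoDB, for example with
#   self.db[spider.name].insert_many([ItemAdapter(i).asdict() for i in self.buffer])
# where the target collection name is a placeholder.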

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# @Time : 2025/9/15 14:36
# @Author : zhaoxiangpeng
# @File : get_db_task.py
import pymysql
FULL_QUERY = """
SELECT r.%(org_id)s, r.%(org_name)s, r.%(query_id)s, q.%(content)s, q.%(source_type)s
FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id
WHERE q.id = %(q_id)s
"""
STRATEGY_FIELDS = ['org_id', 'org_name', 'query_id', 'content', 'source_type']
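# With the column names from STRATEGY_FIELDS substituted for the placeholders and
# q_id filled in, FULL_QUERY renders as:
#   SELECT r.org_id, r.org_name, r.query_id, q.content, q.source_type
#   FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id
#   WHERE q.id = <query_id>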
class TaskManager:
def __init__(self):
self.client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
database='science_data_dept', user='science-data-dept',
passwd='datadept1509', )
def get_task_from_mysql(self):
cursor = self.client.cursor()
record_fields = ['id', 'batch_date', 'query_id', 'task_condition', 'is_done']
sql = "select %(fields)s from task_batch_record" % {'fields': ', '.join(record_fields)}
try:
cursor.execute(sql)
result = cursor.fetchone()
task_record_dic = dict(zip(record_fields, result))
fill = dict(zip(STRATEGY_FIELDS, STRATEGY_FIELDS))
fill.update(q_id=task_record_dic.get("query_id"))
cursor.execute(
FULL_QUERY % fill,
)
result = cursor.fetchone()
task_dic = dict(zip(STRATEGY_FIELDS, result))
task_dic.update(task_record_dic)
except Exception as exc:
raise exc
else:
print(task_dic)
return task_dic
finally:
cursor.close()
if __name__ == '__main__':
tm = TaskManager()
tm.get_task_from_mysql()

@ -0,0 +1,94 @@
# Scrapy settings for science_article_add project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "science_article_add"
SPIDER_MODULES = ["science_article_add.spiders"]
NEWSPIDER_MODULE = "science_article_add.spiders"
ADDONS = {}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "science_article_add (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "science_article_add.middlewares.ScienceArticleAddSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # up to 2 retries (3 attempts in total)
# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # adds some common error codes
#DOWNLOADER_MIDDLEWARES = {
# "science_article_add.middlewares.ScienceArticleAddDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "science_article_add.pipelines.ScienceArticleAddPipeline": 300,
#}
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -0,0 +1,138 @@
import math
from urllib.parse import urlencode
from copy import deepcopy
import scrapy
from science_article_add.items import WosLiteAddItem
from science_article_add.models import wos_model as model
from science_article_add.configs import wos as config
from science_article_add.utils import tools
def calculate_next_page(next_page: int = 1, page_size: int = 100):
return (next_page - 1) * page_size + 1
class WosLatestIncrementSpider(scrapy.Spider):
name = "wos_latest_increment"
# allowed_domains = ["wos-api.clarivate.com"]
# start_urls = ["https://wos-api.clarivate.com/api/woslite"]
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_add.middlewares.WosLiteApiXkeyDownloaderMiddleware": 500
},
ITEM_PIPELINES={
"science_article_add.pipelines.ScienceAddBufferPipeline": 300,
}
)
def __init__(self, task_obj):
scrapy.Spider.__init__(self)
self.task_obj = task_obj
self.record_id = task_obj['id']
self.org_id = task_obj['org_id']
self.org_name = task_obj['org_name']
self.query_id = task_obj['query_id']
self.query_content = task_obj['content']
self.query_condition = task_obj['task_condition']
async def start(self):
full_query = self.query_content
if self.query_condition is not None:
full_query = '%(query)s %(condition)s' % {'query': self.query_content, 'condition': self.query_condition}
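            # e.g. a query content of "OG=(Beijing Forestry University)" combined with a
            # hypothetical task_condition "AND PY=2024" yields
            # "OG=(Beijing Forestry University) AND PY=2024"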
yield scrapy.Request(url=config.WOS_LITE_QUERY_FIRST_API + '?' + urlencode(model.lite_base_model(usr_query=full_query)),
dont_filter=True,
meta={'query': full_query, 'PAGE': 1})
async def parse(self, response, **kwargs):
meta = response.meta
request = response.request
task_query_id = self.query_id
task_org_id = self.org_id
task_record_id = self.record_id
self.logger.debug('%s: %s' % ('parse_query_api', meta))
resp_result = response.json()
query_result = resp_result.get('QueryResult')
datas = resp_result.get('Data')
query_id = query_result.get('QueryID')
records_found = query_result.get('RecordsFound')
max_page = math.ceil(records_found / 100)
meta_copy: dict = deepcopy(meta)
meta_copy.update({'MAX_PAGE': max_page})
meta_copy.update({'TOTAL': records_found})
meta_copy.update({'QUERY_ID': query_id})
meta_copy.update({'next_page': meta['PAGE'] + 1})
meta_copy.update({'PAGE': meta['PAGE'] + 1})
meta_copy.update({'first_record': calculate_next_page(meta_copy['next_page'])})
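        # e.g. if 350 records are found on the first response: MAX_PAGE = 4,
        # next PAGE = 2, first_record = 101 (page size is fixed at 100)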
for data in datas:
add_item = WosLiteAddItem()
            # Prefer an explicitly specified year when storing the record
to_db_year = meta.get("search_year")
if not to_db_year:
publish_year = data.get("Source", {}).get("Published.BiblioYear", [])
if publish_year:
to_db_year = tools.str2int(publish_year[0])
add_item["third_id"] = data.get('UT')
add_item["year"] = to_db_year
add_item["query_ids"] = [task_query_id]
add_item["school_ids"] = [task_org_id]
add_item["task_ids"] = [task_record_id]
yield add_item
        # Page through the results: the query endpoint takes firstRecord/count as
        # query-string parameters on the URL.
        yield scrapy.Request(
            url=config.WOS_LITE_QUERY_API + f'/{query_id}' + '?' + model.lite_query_model(**meta_copy),
            meta=meta_copy,
            callback=self.parse_query_api,
        )
async def parse_query_api(self, response, **kwargs):
meta = response.meta
task_query_id = self.query_id
task_org_id = self.org_id
task_record_id = self.record_id
self.logger.debug("""
%s
%s""" % ('parse_query_api', meta))
resp_result = response.json()
query_id = meta.get('QUERY_ID')
datas = resp_result.get('Data', [])
if len(datas):
for data in datas:
add_item = WosLiteAddItem()
                # Prefer an explicitly specified year when storing the record
to_db_year = meta.get("search_year")
if not to_db_year:
publish_year = data.get("Source", {}).get("Published.BiblioYear", [])
if publish_year:
to_db_year = tools.str2int(publish_year[0])
add_item["third_id"] = data.get('UT')
add_item["year"] = to_db_year
add_item["query_ids"] = [task_query_id]
add_item["school_ids"] = [task_org_id]
add_item["task_ids"] = [task_record_id]
yield add_item
        else:
            # Build the SQL-style condition identifying this task's batch record
            update_state_condition = ('batch_date = "%(batch_date)s"\n'
                                      'query_id = %(query_id)s\n'
                                      'task_condition = "%(task_condition)s"') % self.task_obj
            self.logger.warning('No more data for this query\n%s' % update_state_condition)
            return
if meta['first_record'] < meta['TOTAL']:
meta_copy = deepcopy(meta)
meta_copy.update({'next_page': meta['PAGE'] + 1})
meta_copy.update({'PAGE': meta['PAGE'] + 1})
meta_copy.update({'first_record': calculate_next_page(meta_copy['next_page'])})
            yield scrapy.Request(
                url=config.WOS_LITE_QUERY_API + f'/{query_id}' + '?' + model.lite_query_model(**meta_copy),
                meta=meta_copy,
                callback=self.parse_query_api,
            )

@ -0,0 +1,8 @@
def str2int(val, replace=0):
try:
val = int(val)
except ValueError:
val = replace
except TypeError:
val = replace
return val
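# Examples: str2int("2021") -> 2021, str2int("n/a") -> 0, str2int(None) -> 0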

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = science_article_add.settings
[deploy]
#url = http://localhost:6800/
project = science_article_add