From f977b8ad512ea4fcf98930a270a7ba70140a01de Mon Sep 17 00:00:00 2001 From: zhaoxiangpeng <1943364377@qq.com> Date: Fri, 17 Oct 2025 14:35:46 +0800 Subject: [PATCH] =?UTF-8?q?add:=20wos=20liteapi=20=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=A2=9E=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.idea/science_article_add.iml | 12 + science_article_add/run.py | 12 + .../science_article_add/__init__.py | 0 .../science_article_add/configs/__init__.py | 0 .../science_article_add/configs/wos.py | 96 ++++++ .../science_article_add/items.py | 20 ++ .../science_article_add/middlewares.py | 113 +++++++ .../science_article_add/models/__init__.py | 0 .../science_article_add/models/wos_model.py | 318 ++++++++++++++++++ .../science_article_add/pipelines.py | 59 ++++ .../scripts/get_db_task.py | 49 +++ .../science_article_add/settings.py | 94 ++++++ .../science_article_add/spiders/__init__.py | 4 + .../spiders/wos_latest_increment.py | 138 ++++++++ .../science_article_add/utils/__init__.py | 0 .../science_article_add/utils/tools.py | 8 + science_article_add/scrapy.cfg | 11 + 17 files changed, 934 insertions(+) create mode 100644 science_article_add/.idea/science_article_add.iml create mode 100644 science_article_add/run.py create mode 100644 science_article_add/science_article_add/__init__.py create mode 100644 science_article_add/science_article_add/configs/__init__.py create mode 100644 science_article_add/science_article_add/configs/wos.py create mode 100644 science_article_add/science_article_add/items.py create mode 100644 science_article_add/science_article_add/middlewares.py create mode 100644 science_article_add/science_article_add/models/__init__.py create mode 100644 science_article_add/science_article_add/models/wos_model.py create mode 100644 science_article_add/science_article_add/pipelines.py create mode 100644 science_article_add/science_article_add/scripts/get_db_task.py create mode 100644 science_article_add/science_article_add/settings.py create mode 100644 science_article_add/science_article_add/spiders/__init__.py create mode 100644 science_article_add/science_article_add/spiders/wos_latest_increment.py create mode 100644 science_article_add/science_article_add/utils/__init__.py create mode 100644 science_article_add/science_article_add/utils/tools.py create mode 100644 science_article_add/scrapy.cfg diff --git a/science_article_add/.idea/science_article_add.iml b/science_article_add/.idea/science_article_add.iml new file mode 100644 index 0000000..ecabfc4 --- /dev/null +++ b/science_article_add/.idea/science_article_add.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/science_article_add/run.py b/science_article_add/run.py new file mode 100644 index 0000000..9ddc6bf --- /dev/null +++ b/science_article_add/run.py @@ -0,0 +1,12 @@ +from scrapy.crawler import CrawlerProcess +from scrapy.utils.project import get_project_settings + +from science_article_add.scripts.get_db_task import TaskManager + +tm = TaskManager() +process = CrawlerProcess(get_project_settings()) + +task = tm.get_task_from_mysql() + +process.crawl('wos_latest_increment', task_obj=task) +process.start() diff --git a/science_article_add/science_article_add/__init__.py b/science_article_add/science_article_add/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/science_article_add/science_article_add/configs/__init__.py b/science_article_add/science_article_add/configs/__init__.py new file mode 100644 index 
0000000..e69de29 diff --git a/science_article_add/science_article_add/configs/wos.py b/science_article_add/science_article_add/configs/wos.py new file mode 100644 index 0000000..b3b1ad0 --- /dev/null +++ b/science_article_add/science_article_add/configs/wos.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/1/16 8:41 +# @Author : zhaoxiangpeng +# @File : config.py + +from datetime import datetime + +# 数据来源名 +SOURCE_NAME = 'wos' + +WOS_SEARCH_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch" +WOS_DETAIL_LINK = 'https://webofscience.clarivate.cn/wos/woscc/full-record/{wos_id}' +WOS_DETAIL_API = 'https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch' + +WOS_ADVANCED_SEARCH_API = 'https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch' +WOS_EXPORT_FILE_API = 'https://webofscience.clarivate.cn/api/wosnx/indic/export/saveToFile' + +WOS_RECORD_STREAM_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQueryGetRecordsStream" +WOS_REFINE_API = "https://webofscience.clarivate.cn/api/wosnx/core/runQueryRefine" + +# WOS starter api +WOS_STARTER_DOCUMENT_UID_API = "https://api.clarivate.com/apis/wos-starter/v1/documents/{uid}" # Unique Identifier/Accession Number +WOS_STARTER_DOCUMENT_API = "https://api.clarivate.com/apis/wos-starter/v1/documents" +WOS_STARTER_PER_PAGE_LIMIT = 50 # 每页限制的数量 + +# WOS lite api +WOS_LITE_QUERY_FIRST_API = 'https://wos-api.clarivate.com/api/woslite' # 第一个请求,请求后会有一个query的序号 +WOS_LITE_QUERY_API = 'https://wos-api.clarivate.com/api/woslite/query' # 使用序号进行翻页 + +# 发文表 +WOS_ARTICLE_COLLECTION = 'data_{}_article'.format(SOURCE_NAME) +# 被引量集合 +WOS_CITED_NUMBER_COLLECTION = "relation_cited_number_{}".format(SOURCE_NAME) +# 发文关系表 +SCHOOL_RELATION_COLLECTION = 'relation_school_{}'.format(SOURCE_NAME) +# 参考文献集合 +WOS_REFERENCE_COLLECTION = "relation_reference_{}".format(SOURCE_NAME) +# 待下载Id表 +ARTICLE_TODO_IDS_COLLECTION = "todo_ids_{}".format(SOURCE_NAME) + +# CSCD来源的发文表 +WOS_CSCD_ARTICLE_COLLECTION = 'data_{}_article_{}'.format(SOURCE_NAME, 'cscd') + +# cookie池配置 +# COOKIE_POOL_CONFIG = dict(host=setting.REDIS_HOST, port=6379, db=setting.REDIS_DB, password=setting.REDIS_PASSWORD) +COOKIE_POOL_GROUP = 'cookies_pool:wos:sid*' +COOKIE_POOL_KEY = 'cookies_pool:wos:sid-sjtu' +COOKIE_TTL = 60 * 60 * 4 + +# 下载的单个文件的大小 +BATCH_DOWNLOAD_LIMIT = 500 +# 导出文件时的默认值 +DEFAULT_EXPORT_RECORD_FILTER = "fullRecordPlus" # fullRecordPlus + +# 表头验证配置 +SUCCESS_TABLE_HEAD_START = b'\xef\xbb\xbfPT' +LOST_TABLE_HEAD_START = b'\xef\xbb\xbfnull' +AUTO_TABLE_HEAD_START = b'\xef\xbb\xbfPT\tAU\tBA\tBE\tGP\tAF\tBF\tCA\tTI\tSO\tSE\tBS\tLA\tDT\tCT\tCY\tCL\tSP\tHO\tDE\tID\tAB\tC1\tC3\tRP\tEM\tRI\tOI\tFU\tFP\tFX\tCR\tNR\tTC\tZ9\tU1\tU2\tPU\tPI\tPA\tSN\tEI\tBN\tJ9\tJI\tPD\tPY\tVL\tIS\tPN\tSU\tSI\tMA\tBP\tEP\tAR\tDI\tDL\tD2\tEA\tPG\tWC\tWE\tSC\tGA\tPM\tOA\tHC\tHP\tDA\tUT\r\n' + + +CORE_NAME_TABLE = dict( + WOSCC="Web of Science Core Collection", + BCI="BIOSIS Citation Index", + SCIELO="SciELO Citation Index", + RSCI="Russian Science Citation Index", + CSCD="Chinese Science Citation Database℠", + ARCI="Arabic Citation Index", + DIIDW="Derwent Innovations Index", + PPRN="", + PQDT="ProQuest ™ Dissertations & Theses Citation Index" +) +NAV_NAME_TABLE = dict( + SCI="Science Citation Index Expanded (SCI-Expanded)", + ESCI="Emerging Sources Citation Index (ESCI)", + SSCI="Social Sciences Citation Index (SSCI)", + ISTP="Conference Proceedings Citation Index – Science (CPCI-S)", + BSCI="Book Citation Index – Science (BKCI-S)", + AHCI="Arts & Humanities Citation Index 
(A&HCI)", + IC="Index Chemicus (IC)", + ISSHP="Conference Proceedings Citation Index – Social Sciences & Humanities (CPCI-SSH)" +) + +TASK_CONFIG = { + "school_id": 83, + "school_name": "北京林业大学", + "search_policy": """OG=(Beijing Forestry University)""", + "crawl_year": [2021, 2022, 2023], + "source_type": 1, + "priority": 10, + "is_important": 1, + "update_interval": 60 * 60 * 24 * 14, + "create_time": datetime.now(), + "last_time": datetime.now(), + "next_time": datetime.now(), + "state": 0 +} diff --git a/science_article_add/science_article_add/items.py b/science_article_add/science_article_add/items.py new file mode 100644 index 0000000..9ac61e7 --- /dev/null +++ b/science_article_add/science_article_add/items.py @@ -0,0 +1,20 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ScienceArticleAddItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + third_id = scrapy.Field() + updated_at = scrapy.Field() + + +class WosLiteAddItem(ScienceArticleAddItem): + year = scrapy.Field() + query_ids = scrapy.Field() + school_ids = scrapy.Field() + task_ids = scrapy.Field() diff --git a/science_article_add/science_article_add/middlewares.py b/science_article_add/science_article_add/middlewares.py new file mode 100644 index 0000000..7a4e77b --- /dev/null +++ b/science_article_add/science_article_add/middlewares.py @@ -0,0 +1,113 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class ScienceArticleAddSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + async def process_start(self, start): + # Called with an async iterator over the spider start() method or the + # maching method of an earlier spider middleware. + async for item_or_request in start: + yield item_or_request + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class ScienceArticleAddDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. 
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class WosLiteApiXkeyDownloaderMiddleware:
+    async def process_request(self, request, spider):
+        # Attach the WOS Lite API key header and return None so the request
+        # continues through the downloader middleware chain (returning the
+        # request object here would make Scrapy reschedule it instead of
+        # downloading it).
+        request.headers.update({
+            'X-ApiKey': '941a216f25cbef0f80ee4ba58a08ef1e19dee7a4'
+        })
+        return None
diff --git a/science_article_add/science_article_add/models/__init__.py b/science_article_add/science_article_add/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/science_article_add/science_article_add/models/wos_model.py b/science_article_add/science_article_add/models/wos_model.py
new file mode 100644
index 0000000..8e4bff6
--- /dev/null
+++ b/science_article_add/science_article_add/models/wos_model.py
@@ -0,0 +1,318 @@
+# -*- coding: utf-8 -*-
+# @Time : 2023/7/13 9:40
+# @Author : zhaoxiangpeng
+# @File : model.py
+
+import json
+import enum
+import warnings
+from typing import List, Tuple, Any, Dict, Union
+from urllib.parse import urlencode
+
+from science_article_add.configs import wos as config
+
+false = False
+true = True
+null = None
+
+
+class WosDB(enum.Enum):
+    WOS = 1
+    CSCD = 2
+
+
+class AnalyzesEnum(enum.Enum):
+    WOSCC = ["TP.Value.6", "REVIEW.Value.6", "EARLY ACCESS.Value.6", "OA.Value.6", "DR.Value.6", "ECR.Value.6",
+             "PY.Field_D.6", "DT.Value.6", "AU.Value.6", "DX2NG.Value.6", "PEERREVIEW.Value.6"]
+    CSCD = ["TP.Value.6", "DR.Value.6", "OA.Value.6", "PY.Field_D.6", "DT.Value.6", "SJ.Value.6", "AU.Value.6",
+            "OG.Value.6", "SO.Value.6"]
+
+
+ColNameMap = dict(WOS='WOSCC', CSCD='CSCD')
+
+
+def calculate_next_page(next_page: int = 1, page_size: int = 100):
+    """
+    Compute the record cursor for the next page, i.e. the index of its first record.
+    :param next_page: page number of the next page
+    :param page_size: number of records per page
+    :return:
+    """
+    return (next_page - 1) * page_size + 1
+
+
+def lite_base_model(usr_query: str, db_id: int = None, first_record: int = 1, page_size: int = 100, **kwargs):
+    if db_id is None:
+        db_id = 1
+    if first_record > 1e5:
+        warnings.warn('first_record must be between 1 and 100000')
+    model = {
+        'databaseId': WosDB(db_id).name,
+        'firstRecord': first_record,
+        'count': page_size,
+        'usrQuery': usr_query
+    }
+    # return urlencode(model)
+    return model
+
+
+def lite_query_model(db_id: int = None, first_record: int = 1, page_size: int = 100, **kwargs):
+    if db_id is None:
+        db_id = 1
+    model = {
+        'databaseId': WosDB(db_id).name,
'firstRecord': first_record, + 'count': page_size, + } + return urlencode(model) + + +def starter_documents_uid_get(uid, detail: str = None): + """ + + :param uid: + :param detail: + :return: + """ + _query_params: List[Tuple[str, str]] = [] + if detail is not None: + _query_params.append(("detail", detail)) + + +def starter_documents_get(q, db: WosDB = WosDB.WOS.name, limit: int = config.WOS_STARTER_PER_PAGE_LIMIT, page: int = 1, sort_field: str = None, + modified_time_span=None, tc_modified_time_span=None, detail=None, **kwargs): + """ + :param q: + :param db: + :param limit: 最大为50 + :param page: 当limit为50时,范围为1~2000,也就是最多10w条 + :param sort_field: + :param modified_time_span: + :param tc_modified_time_span: + :param detail: 默认全部数据,如果值为short,返回较少的字段(uid, links{record,citingArticles,references,related}, citations[{db,count}], identifiers{doi,issn}) + :param kwargs: + :return: + """ + _query_params: List[Tuple[str, str]] = [] + _query_params.append(("q", q)) + if db: pass + _query_params.append(("db", db)) + _query_params.append(("limit", limit)) + _query_params.append(("page", page)) + if detail is not None: + _query_params.append(("detail", detail)) + return _query_params + + +def make_advanced_search_ut(query: str = None, wos_ids: List = None, limit: int = 50, col_name: str = "WOS") -> Dict[ + str, Any]: + if query is None: + if wos_ids is None: + raise ValueError('query 和 wos_ids 必须满足其中一个不为None') + query = ' OR '.join([f'UT=({wos_id})' for wos_id in wos_ids]) + # 通过一个自定义的名字去拿核心 + product = ColNameMap[col_name] + model = { + "product": product, + "searchMode": "general", + "viewType": "search", + "serviceMode": "summary", + "search": { + "mode": "general", + "database": product, + "query": [ + { + "rowText": query + } + ], + "sets": [], + "options": { + "lemmatize": "On" + } + }, + "retrieve": { + "count": limit, + "history": True, + "jcr": True, + "sort": "relevance", + "analyzes": getattr(AnalyzesEnum, product).value + }, + "eventMode": None, + "isPreprintReview": False + } + return model + + +def export_search_data_to_txt( + q_id: str, + mark_from: int = 1, + mark_to: int = 500, + col_name: str = "WOS", + filters: str = config.DEFAULT_EXPORT_RECORD_FILTER +) -> Dict[str, Any]: + """ + 导出搜索到的记录 + :param q_id: 通过检索得到的检索结果id + :param mark_from: 记录开始,包含 + :param mark_to: 记录结束,包含 + :param col_name: 来源库/核心 + :param filters: fullRecord(完整记录)/fullRecordPlus(完整记录和参考文献) + :return: + """ + if mark_to - mark_from > 500: + mark_to = mark_from + 499 + model = {"parentQid": q_id, "sortBy": "relevance", + "displayTimesCited": "true", "displayCitedRefs": "true", "product": "UA", "colName": col_name, + "displayUsageInfo": "true", "fileOpt": "othersoftware", "action": "saveToTab", + "markFrom": str(mark_from), "markTo": str(mark_to), + "view": "summary", "isRefQuery": "false", "locale": "zh_CN", "filters": filters} + return model + + +def article_detail_model(uts: Union[List[str], str], core: str = "WOSCC"): + """ + 详情 https://webofscience.clarivate.cn/wos/woscc/full-record/{wos_id} + 接口 https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch + :param uts: + :param core: + :return: + """ + if isinstance(uts, str): + uts = [uts] + model = { + "eventMode": null, + "isPreprintReview": false, + "product": core, + "retrieve": { + "first": 1, "links": "retrieve", "sort": "relevance", "count": 1, "view": "super", + "coll": null, "activity": false, "analyzes": null, "jcr": true, "reviews": true, + "highlight": null, + "secondaryRetrieve": { + "associated_data": { + "sort": "relevance", "count": 10 + }, 
+ "cited_references": { + "sort": "author-ascending", "count": 30 + }, + "citing_article": { + "sort": "date", "count": 2, "links": null, "view": "mini" + }, + "cited_references_with_context": { + "sort": "date", "count": 135, "view": "mini" + }, + "recommendation_articles": { + "sort": "recommendation-relevance", "count": 5, "links": null, "view": "mini" + }, + "grants_to_wos_records": { + "sort": "date-descending", "count": 30, "links": null, "view": "mini" + } + } + }, + "search": { + "database": core, + "mode": "record_ids", + "uts": uts + }, + "searchMode": "record_ids", + "viewType": "search", + "serviceMode": "summary", + } + return model + + +# 被引用专用model +def get_wos_core_cites( + uts_or_qid: str, + year_range: tuple = None, + core: str = "WOSCC", + parent_db: str = "WOSCC", + is_refine: bool = False +): + """ + https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch + :param uts_or_qid: + :param year_range: 筛选的年份范围 + :param core: 检索的数据库 + :param parent_db: + :param is_refine: 是否是精炼检索 + :return: + """ + model = { + "eventMode": null, + "isPreprintReview": false, + "product": core, + + "search": {"database": core, "mode": "citing_article", "parentDatabase": parent_db, + "parentDoc": null, + "parentId": {"type": "colluid", "value": uts_or_qid}, + "parentQid": null, "parentSort": null}, + # "retrieve": { + # "sort": "date-descending", + # "count": 50, + # "jcr": true, + # "history": true, + # "analyzes": ["TP.Value.6", "REVIEW.Value.6", "EARLY ACCESS.Value.6", "OA.Value.6", + # "DR.Value.6", "ECR.Value.6", "PY.Field_D.6", "DT.Value.6", "AU.Value.6", + # "DX2NG.Value.6", "PEERREVIEW.Value.6"] + # }, + + "searchMode": "citing_article", + "serviceMode": "summary", + "viewType": "search", + } + refines = [] + if year_range: + is_refine = True + years = list(range(*year_range)) + [year_range[-1]] + refines.append(dict( + index="PY", value=[str(year) for year in years] + )) + len(refines) and model.update({"refines": refines}) + if is_refine: + model.setdefault("qid", uts_or_qid) + model.pop("search") + model.pop("isPreprintReview") + model.update(viewType="refine") + return model + + +def get_aggregation_wos_cited(q_id: str, core: str = "WOSCC"): + """ + 获取各核心引用的聚合 + https://webofscience.clarivate.cn/api/wosnx/core/runQueryGetRecordsStream + """ + model = { + "product": core, + "qid": q_id, + "retrieve": { + "analyzes": ["EDN.Value.200"] + }, + "searchMode": "citing_article", + "viewType": "records" + } + return model + + +def get_refine_count(q_id: str, count: int = 5): + model = { + "eventMode": null, + "product": "WOSCC", + "qid": q_id, + "refines": [ + {"index": "EDN", "value": ["WOS.SCI", "WOS.SSCI", "WOS.AHCI"]} + ], + # "retrieve": { + # "count": count, "sort": "date-descending", "history": true, "jcr": true, + # "analyzes": ["TP.Value.6", "REVIEW.Value.6", "EARLY ACCESS.Value.6", "OA.Value.6", + # "DR.Value.6", "ECR.Value.6", "PY.Field_D.6", "DT.Value.6", "AU.Value.6", + # "DX2NG.Value.6", "PEERREVIEW.Value.6"] + # }, + "searchMode": "citing_article", + "serviceMode": "summary", + "viewType": "refine", + } + return model + + +if __name__ == '__main__': + m1 = lite_base_model(WosDB.WOS) diff --git a/science_article_add/science_article_add/pipelines.py b/science_article_add/science_article_add/pipelines.py new file mode 100644 index 0000000..acd0b30 --- /dev/null +++ b/science_article_add/science_article_add/pipelines.py @@ -0,0 +1,59 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: 
https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter +import pymongo + + +class ScienceAddBufferPipeline: + def __init__(self, buffer_max_size: int = 100): + self.buffer = [] + self.buffer_size = 0 + self.buffer_max_size = buffer_max_size + + @classmethod + def from_crawler(cls, crawler): + return cls( + buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100), + ) + + def process_item(self, item, spider): + self.buffer.append(item) + return item + + def close_spider(self, spider): + self.buffer.clear() + + +class ScienceArticleAddPipeline(ScienceAddBufferPipeline): + def __init__(self, mongo_uri, mongo_db, buffer_max_size): + super().__init__(buffer_max_size=buffer_max_size) + self.mongo_uri = mongo_uri + self.mongo_db = mongo_db + self.client = None + self.db = None + + @classmethod + def from_crawler(cls, crawler): + return cls( + mongo_uri=crawler.settings.get("MONGO_URI"), + mongo_db=crawler.settings.get("MONGO_DATABASE", "items"), + buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100), + ) + + def open_spider(self, spider): + self.client = pymongo.MongoClient(self.mongo_uri) + self.db = self.client[self.mongo_db] + + def close_spider(self, spider): + self.client.close() + + def process_item(self, item, spider): + super().process_item(item, spider) + if self.buffer_size >= self.buffer_max_size: + self.buffer.clear() + return item diff --git a/science_article_add/science_article_add/scripts/get_db_task.py b/science_article_add/science_article_add/scripts/get_db_task.py new file mode 100644 index 0000000..a613c1f --- /dev/null +++ b/science_article_add/science_article_add/scripts/get_db_task.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/9/15 14:36 +# @Author : zhaoxiangpeng +# @File : get_db_task.py + +import pymysql + +FULL_QUERY = """ +SELECT r.%(org_id)s, r.%(org_name)s, r.%(query_id)s, q.%(content)s, q.%(source_type)s +FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id +WHERE q.id = %(q_id)s +""" +STRATEGY_FIELDS = ['org_id', 'org_name', 'query_id', 'content', 'source_type'] + + +class TaskManager: + def __init__(self): + self.client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306, + database='science_data_dept', user='science-data-dept', + passwd='datadept1509', ) + + def get_task_from_mysql(self): + cursor = self.client.cursor() + record_fields = ['id', 'batch_date', 'query_id', 'task_condition', 'is_done'] + sql = "select %(fields)s from task_batch_record" % {'fields': ', '.join(record_fields)} + try: + cursor.execute(sql) + result = cursor.fetchone() + task_record_dic = dict(zip(record_fields, result)) + fill = dict(zip(STRATEGY_FIELDS, STRATEGY_FIELDS)) + fill.update(q_id=task_record_dic.get("query_id")) + cursor.execute( + FULL_QUERY % fill, + ) + result = cursor.fetchone() + task_dic = dict(zip(STRATEGY_FIELDS, result)) + task_dic.update(task_record_dic) + except Exception as exc: + raise exc + else: + print(task_dic) + return task_dic + finally: + cursor.close() + + +if __name__ == '__main__': + tm = TaskManager() + tm.get_task_from_mysql() diff --git a/science_article_add/science_article_add/settings.py b/science_article_add/science_article_add/settings.py new file mode 100644 index 0000000..d2f6a35 --- /dev/null +++ b/science_article_add/science_article_add/settings.py @@ -0,0 +1,94 @@ +# Scrapy settings for science_article_add project +# +# For simplicity, this file contains only 
settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "science_article_add" + +SPIDER_MODULES = ["science_article_add.spiders"] +NEWSPIDER_MODULE = "science_article_add.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "science_article_add (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Concurrency and throttling settings +#CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 1 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "science_article_add.middlewares.ScienceArticleAddSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +RETRY_ENABLED = True +RETRY_TIMES = 2 # 重试3次 +# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404] # 增加了一些常见的错误码 +#DOWNLOADER_MIDDLEWARES = { +# "science_article_add.middlewares.ScienceArticleAddDownloaderMiddleware": 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# "science_article_add.pipelines.ScienceArticleAddPipeline": 300, +#} +MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/" +MONGO_DATABASE = "science2" + +REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10' + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" diff --git a/science_article_add/science_article_add/spiders/__init__.py b/science_article_add/science_article_add/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/science_article_add/science_article_add/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of 
your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/science_article_add/science_article_add/spiders/wos_latest_increment.py b/science_article_add/science_article_add/spiders/wos_latest_increment.py new file mode 100644 index 0000000..0f61b36 --- /dev/null +++ b/science_article_add/science_article_add/spiders/wos_latest_increment.py @@ -0,0 +1,138 @@ +import math +from urllib.parse import urlencode +from copy import deepcopy +import scrapy + +from science_article_add.items import WosLiteAddItem +from science_article_add.models import wos_model as model +from science_article_add.configs import wos as config +from science_article_add.utils import tools + + +def calculate_next_page(next_page: int = 1, page_size: int = 100): + return (next_page - 1) * page_size + 1 + + +class WosLatestIncrementSpider(scrapy.Spider): + name = "wos_latest_increment" + # allowed_domains = ["wos-api.clarivate.com"] + # start_urls = ["https://wos-api.clarivate.com/api/woslite"] + custom_settings = dict( + DOWNLOADER_MIDDLEWARES={ + "science_article_add.middlewares.WosLiteApiXkeyDownloaderMiddleware": 500 + }, + ITEM_PIPELINES={ + "science_article_add.pipelines.ScienceAddBufferPipeline": 300, + } + ) + + def __init__(self, task_obj): + scrapy.Spider.__init__(self) + self.task_obj = task_obj + self.record_id = task_obj['id'] + self.org_id = task_obj['org_id'] + self.org_name = task_obj['org_name'] + self.query_id = task_obj['query_id'] + self.query_content = task_obj['content'] + self.query_condition = task_obj['task_condition'] + + async def start(self): + full_query = self.query_content + if self.query_condition is not None: + full_query = '%(query)s %(condition)s' % {'query': self.query_content, 'condition': self.query_condition} + yield scrapy.Request(url=config.WOS_LITE_QUERY_FIRST_API + '?' 
+ urlencode(model.lite_base_model(usr_query=full_query)), + dont_filter=True, + meta={'query': full_query, 'PAGE': 1}) + + async def parse(self, response, **kwargs): + meta = response.meta + request = response.request + task_query_id = self.query_id + task_org_id = self.org_id + task_record_id = self.record_id + self.logger.debug('%s: %s' % ('parse_query_api', meta)) + + resp_result = response.json() + + query_result = resp_result.get('QueryResult') + datas = resp_result.get('Data') + + query_id = query_result.get('QueryID') + records_found = query_result.get('RecordsFound') + max_page = math.ceil(records_found / 100) + meta_copy: dict = deepcopy(meta) + meta_copy.update({'MAX_PAGE': max_page}) + meta_copy.update({'TOTAL': records_found}) + meta_copy.update({'QUERY_ID': query_id}) + meta_copy.update({'next_page': meta['PAGE'] + 1}) + meta_copy.update({'PAGE': meta['PAGE'] + 1}) + meta_copy.update({'first_record': calculate_next_page(meta_copy['next_page'])}) + + for data in datas: + add_item = WosLiteAddItem() + # 入库年份优先按照自己指定的 + to_db_year = meta.get("search_year") + if not to_db_year: + publish_year = data.get("Source", {}).get("Published.BiblioYear", []) + if publish_year: + to_db_year = tools.str2int(publish_year[0]) + add_item["third_id"] = data.get('UT') + add_item["year"] = to_db_year + add_item["query_ids"] = [task_query_id] + add_item["school_ids"] = [task_org_id] + add_item["task_ids"] = [task_record_id] + yield add_item + + yield scrapy.Request( + url=config.WOS_LITE_QUERY_API + f'/{query_id}', + body=model.lite_query_model(**meta_copy), + meta=meta_copy, + callback=self.parse_query_api, + ) + + async def parse_query_api(self, response, **kwargs): + meta = response.meta + task_query_id = self.query_id + task_org_id = self.org_id + task_record_id = self.record_id + self.logger.debug(""" + %s + %s""" % ('parse_query_api', meta)) + + resp_result = response.json() + query_id = meta.get('QUERY_ID') + + datas = resp_result.get('Data', []) + if len(datas): + for data in datas: + add_item = WosLiteAddItem() + # 入库年份优先按照自己指定的 + to_db_year = meta.get("search_year") + if not to_db_year: + publish_year = data.get("Source", {}).get("Published.BiblioYear", []) + if publish_year: + to_db_year = tools.str2int(publish_year[0]) + add_item["third_id"] = data.get('UT') + add_item["year"] = to_db_year + add_item["query_ids"] = [task_query_id] + add_item["school_ids"] = [task_org_id] + add_item["task_ids"] = [task_record_id] + yield add_item + else: + # 根据条件记录生成sql记录结果 + update_state_condition = ('batch_date = "%(batch_date)s"\n' + 'query_id = %(query_id)s\n' + 'task_condition = "%(task_condition)s"') % self.task_obj + self.logger.warning('没有数据了\n%s' % update_state_condition) + return + if meta['first_record'] < meta['TOTAL']: + meta_copy = deepcopy(meta) + meta_copy.update({'next_page': meta['PAGE'] + 1}) + meta_copy.update({'PAGE': meta['PAGE'] + 1}) + meta_copy.update({'first_record': calculate_next_page(meta_copy['next_page'])}) + yield scrapy.Request( + url=config.WOS_LITE_QUERY_API + f'/{query_id}', + body=model.lite_query_model(**meta_copy), + meta=meta_copy, + callback=self.parse_query_api, + ) diff --git a/science_article_add/science_article_add/utils/__init__.py b/science_article_add/science_article_add/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/science_article_add/science_article_add/utils/tools.py b/science_article_add/science_article_add/utils/tools.py new file mode 100644 index 0000000..599b300 --- /dev/null +++ 
b/science_article_add/science_article_add/utils/tools.py
@@ -0,0 +1,8 @@
+def str2int(val, replace=0):
+    try:
+        val = int(val)
+    except ValueError:
+        val = replace
+    except TypeError:
+        val = replace
+    return val
\ No newline at end of file
diff --git a/science_article_add/scrapy.cfg b/science_article_add/scrapy.cfg
new file mode 100644
index 0000000..c806f7a
--- /dev/null
+++ b/science_article_add/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = science_article_add.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = science_article_add
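
Note: the two-step WOS Lite flow that wos_latest_increment drives (one request to WOS_LITE_QUERY_FIRST_API to obtain a QueryID and RecordsFound, then paging WOS_LITE_QUERY_API/{QueryID} with firstRecord/count) can be exercised outside Scrapy with a plain-requests sketch. The parameter and response field names below mirror configs/wos.py, models/wos_model.py, and the spider; the endpoint behaviour is assumed from those helpers, and YOUR_API_KEY, fetch_all_uts, and the example query string are illustrative placeholders, not part of the patch.

# Minimal sketch of the two-step WOS Lite API flow implemented by the spider.
# Field names (usrQuery, databaseId, firstRecord, count, QueryResult.QueryID,
# RecordsFound, Data[].UT) mirror the project code; the key is a placeholder.
import math
import requests

FIRST_API = 'https://wos-api.clarivate.com/api/woslite'
QUERY_API = 'https://wos-api.clarivate.com/api/woslite/query'
HEADERS = {'X-ApiKey': 'YOUR_API_KEY'}  # placeholder, assumed credential
PAGE_SIZE = 100


def fetch_all_uts(usr_query: str) -> list:
    # Step 1: run the search once to get a QueryID and the total record count.
    resp = requests.get(FIRST_API, headers=HEADERS, params={
        'databaseId': 'WOS',
        'usrQuery': usr_query,
        'firstRecord': 1,
        'count': PAGE_SIZE,
    })
    resp.raise_for_status()
    payload = resp.json()
    query_id = payload['QueryResult']['QueryID']
    total = payload['QueryResult']['RecordsFound']
    uts = [rec.get('UT') for rec in payload.get('Data', [])]

    # Step 2: page through the stored query using the same cursor math as
    # calculate_next_page(): first record of page n is (n - 1) * PAGE_SIZE + 1.
    for page in range(2, math.ceil(total / PAGE_SIZE) + 1):
        first_record = (page - 1) * PAGE_SIZE + 1
        resp = requests.get(f'{QUERY_API}/{query_id}', headers=HEADERS, params={
            'firstRecord': first_record,
            'count': PAGE_SIZE,
        })
        resp.raise_for_status()
        uts.extend(rec.get('UT') for rec in resp.json().get('Data', []))
    return uts


if __name__ == '__main__':
    print(len(fetch_all_uts('OG=(Beijing Forestry University) AND PY=2024')))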
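
As committed, ScienceAddBufferPipeline appends items to self.buffer but never increments buffer_size, and ScienceArticleAddPipeline opens a MongoDB connection yet only clears the buffer, so no item is ever written. Below is a sketch of one way a flush could look, assuming a bulk upsert keyed on third_id; the article_add collection name, the $addToSet merge of query/school/task ids, and the flush-on-threshold policy are all assumptions for illustration, not what the patch currently does.

# Hypothetical flush helper for the buffered pipeline: bulk-upsert buffered
# WosLiteAddItem records keyed on third_id. Collection name and update
# semantics are assumptions.
from itemadapter import ItemAdapter
from pymongo import MongoClient, UpdateOne


def flush_buffer(db, buffer, collection_name='article_add'):
    if not buffer:
        return 0
    ops = []
    for item in buffer:
        doc = ItemAdapter(item).asdict()
        ops.append(UpdateOne(
            {'third_id': doc['third_id']},
            {
                '$set': {'year': doc.get('year'), 'updated_at': doc.get('updated_at')},
                '$addToSet': {
                    'query_ids': {'$each': doc.get('query_ids', [])},
                    'school_ids': {'$each': doc.get('school_ids', [])},
                    'task_ids': {'$each': doc.get('task_ids', [])},
                },
            },
            upsert=True,
        ))
    result = db[collection_name].bulk_write(ops, ordered=False)
    buffer.clear()
    return result.upserted_count + result.modified_count


if __name__ == '__main__':
    client = MongoClient('mongodb://localhost:27017/')  # placeholder URI
    print(flush_buffer(client['science2'], []))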