add: wos liteapi data increment
parent b14a503758
commit f977b8ad51
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="pydevenv" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>
@@ -0,0 +1,12 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_add.scripts.get_db_task import TaskManager

tm = TaskManager()
process = CrawlerProcess(get_project_settings())

task = tm.get_task_from_mysql()

process.crawl('wos_latest_increment', task_obj=task)
process.start()
@@ -0,0 +1,20 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScienceArticleAddItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class WosLiteAddItem(ScienceArticleAddItem):
    year = scrapy.Field()
    query_ids = scrapy.Field()
    school_ids = scrapy.Field()
    task_ids = scrapy.Field()
@@ -0,0 +1,113 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class ScienceArticleAddSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScienceArticleAddDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class WosLiteApiXkeyDownloaderMiddleware:
    """Injects the WOS Lite API key into the headers of every outgoing request."""

    async def process_request(self, request, spider):
        key_param = {
            'X-ApiKey': '941a216f25cbef0f80ee4ba58a08ef1e19dee7a4'
        }
        # Update the existing Headers object in place and return None so the
        # request keeps flowing through the middleware chain; returning a
        # Request here would only reschedule it and it would never be downloaded.
        request.headers.update(key_param)
        return None
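The middleware above hardcodes the API key in the source. A small variant, not part of this commit, that reads the key from a crawler setting instead, so it can live in settings.py or the environment; the setting name WOS_API_KEY and the class name are assumptions made for illustration.

# Hypothetical variant: read the key from a setting instead of hardcoding it.
class WosLiteApiXkeySettingsMiddleware:
    def __init__(self, api_key):
        self.api_key = api_key

    @classmethod
    def from_crawler(cls, crawler):
        # WOS_API_KEY is an assumed setting name, not one defined by this commit.
        return cls(api_key=crawler.settings.get("WOS_API_KEY"))

    def process_request(self, request, spider):
        # Set the header and let the request continue through the chain.
        if self.api_key:
            request.headers['X-ApiKey'] = self.api_key
        return None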
@@ -0,0 +1,59 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo


class ScienceAddBufferPipeline:
    def __init__(self, buffer_max_size: int = 100):
        self.buffer = []
        self.buffer_size = 0
        self.buffer_max_size = buffer_max_size

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100),
        )

    def process_item(self, item, spider):
        self.buffer.append(item)
        self.buffer_size = len(self.buffer)  # keep the counter in sync with the buffer
        return item

    def close_spider(self, spider):
        self.buffer.clear()


class ScienceArticleAddPipeline(ScienceAddBufferPipeline):
    def __init__(self, mongo_uri, mongo_db, buffer_max_size):
        super().__init__(buffer_max_size=buffer_max_size)
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
            buffer_max_size=crawler.settings.get("BUFFER_MAX_SIZE", 100),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        super().process_item(item, spider)
        if self.buffer_size >= self.buffer_max_size:
            # NOTE: as committed, the buffered items are only dropped here;
            # nothing is written to MongoDB yet.
            self.buffer.clear()
        return item
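ScienceArticleAddPipeline opens a MongoDB connection but, as committed, never writes the buffered items; it only clears the buffer once the threshold is reached. A minimal sketch, not part of this commit, of what a flush-to-Mongo subclass could look like; the collection name wos_lite_add and the choice of upserting on third_id are assumptions made for illustration.

# Hypothetical extension: flush the buffer to MongoDB instead of discarding it.
import pymongo
from itemadapter import ItemAdapter


class ScienceArticleAddMongoFlushPipeline(ScienceArticleAddPipeline):
    COLLECTION = "wos_lite_add"  # assumed collection name

    def _flush(self):
        if not self.buffer:
            return
        ops = [
            pymongo.UpdateOne(
                {"third_id": ItemAdapter(item)["third_id"]},  # upsert keyed on the WOS UT id
                {"$set": ItemAdapter(item).asdict()},
                upsert=True,
            )
            for item in self.buffer
        ]
        self.db[self.COLLECTION].bulk_write(ops, ordered=False)
        self.buffer.clear()

    def process_item(self, item, spider):
        # Buffer directly here so the parent's drop-on-threshold logic is bypassed.
        self.buffer.append(item)
        if len(self.buffer) >= self.buffer_max_size:
            self._flush()
        return item

    def close_spider(self, spider):
        self._flush()
        self.client.close()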
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# @Time : 2025/9/15 14:36
# @Author : zhaoxiangpeng
# @File : get_db_task.py

import pymysql

FULL_QUERY = """
SELECT r.%(org_id)s, r.%(org_name)s, r.%(query_id)s, q.%(content)s, q.%(source_type)s
FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id
WHERE q.id = %(q_id)s
"""
STRATEGY_FIELDS = ['org_id', 'org_name', 'query_id', 'content', 'source_type']


class TaskManager:
    def __init__(self):
        self.client: pymysql.Connection = pymysql.connect(host='43.140.203.187', port=3306,
                                                          database='science_data_dept', user='science-data-dept',
                                                          passwd='datadept1509', )

    def get_task_from_mysql(self):
        cursor = self.client.cursor()
        record_fields = ['id', 'batch_date', 'query_id', 'task_condition', 'is_done']
        sql = "select %(fields)s from task_batch_record" % {'fields': ', '.join(record_fields)}
        try:
            cursor.execute(sql)
            result = cursor.fetchone()
            task_record_dic = dict(zip(record_fields, result))
            fill = dict(zip(STRATEGY_FIELDS, STRATEGY_FIELDS))
            fill.update(q_id=task_record_dic.get("query_id"))
            cursor.execute(
                FULL_QUERY % fill,
            )
            result = cursor.fetchone()
            task_dic = dict(zip(STRATEGY_FIELDS, result))
            task_dic.update(task_record_dic)
        except Exception as exc:
            raise exc
        else:
            print(task_dic)
            return task_dic
        finally:
            cursor.close()


if __name__ == '__main__':
    tm = TaskManager()
    tm.get_task_from_mysql()
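FULL_QUERY above interpolates both the column names and the q_id value into the SQL text with %-formatting. Only q_id is a runtime value, so here is a small sketch, under that assumption and not part of the commit, of the same lookup with the value bound through pymysql's parameter binding; fetch_strategy is an illustrative helper name.

# Illustrative only: same strategy lookup with query_id bound as a parameter.
FULL_QUERY_PARAM = """
SELECT r.org_id, r.org_name, r.query_id, q.content, q.source_type
FROM task_search_strategy AS q JOIN relation_org_query AS r ON r.query_id = q.id
WHERE q.id = %s
"""

STRATEGY_FIELDS = ['org_id', 'org_name', 'query_id', 'content', 'source_type']


def fetch_strategy(cursor, query_id):
    """Run the lookup with query_id passed as a bound parameter, not formatted in."""
    cursor.execute(FULL_QUERY_PARAM, (query_id,))
    row = cursor.fetchone()
    return dict(zip(STRATEGY_FIELDS, row)) if row else None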
@@ -0,0 +1,94 @@
# Scrapy settings for science_article_add project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "science_article_add"

SPIDER_MODULES = ["science_article_add.spiders"]
NEWSPIDER_MODULE = "science_article_add.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "science_article_add (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "science_article_add.middlewares.ScienceArticleAddSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # retry failed requests up to 2 times (3 attempts in total)
# RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # adds some common error codes
#DOWNLOADER_MIDDLEWARES = {
#    "science_article_add.middlewares.ScienceArticleAddDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "science_article_add.pipelines.ScienceArticleAddPipeline": 300,
#}
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"

REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,138 @@
import math
from urllib.parse import urlencode
from copy import deepcopy
import scrapy

from science_article_add.items import WosLiteAddItem
from science_article_add.models import wos_model as model
from science_article_add.configs import wos as config
from science_article_add.utils import tools


def calculate_next_page(next_page: int = 1, page_size: int = 100):
    return (next_page - 1) * page_size + 1


class WosLatestIncrementSpider(scrapy.Spider):
    name = "wos_latest_increment"
    # allowed_domains = ["wos-api.clarivate.com"]
    # start_urls = ["https://wos-api.clarivate.com/api/woslite"]
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_add.middlewares.WosLiteApiXkeyDownloaderMiddleware": 500
        },
        ITEM_PIPELINES={
            "science_article_add.pipelines.ScienceAddBufferPipeline": 300,
        }
    )

    def __init__(self, task_obj):
        scrapy.Spider.__init__(self)
        self.task_obj = task_obj
        self.record_id = task_obj['id']
        self.org_id = task_obj['org_id']
        self.org_name = task_obj['org_name']
        self.query_id = task_obj['query_id']
        self.query_content = task_obj['content']
        self.query_condition = task_obj['task_condition']

    async def start(self):
        full_query = self.query_content
        if self.query_condition is not None:
            full_query = '%(query)s %(condition)s' % {'query': self.query_content, 'condition': self.query_condition}
        yield scrapy.Request(url=config.WOS_LITE_QUERY_FIRST_API + '?' + urlencode(model.lite_base_model(usr_query=full_query)),
                             dont_filter=True,
                             meta={'query': full_query, 'PAGE': 1})

    async def parse(self, response, **kwargs):
        meta = response.meta
        request = response.request
        task_query_id = self.query_id
        task_org_id = self.org_id
        task_record_id = self.record_id
        self.logger.debug('%s: %s' % ('parse', meta))

        resp_result = response.json()

        query_result = resp_result.get('QueryResult')
        datas = resp_result.get('Data')

        query_id = query_result.get('QueryID')
        records_found = query_result.get('RecordsFound')
        max_page = math.ceil(records_found / 100)
        meta_copy: dict = deepcopy(meta)
        meta_copy.update({'MAX_PAGE': max_page})
        meta_copy.update({'TOTAL': records_found})
        meta_copy.update({'QUERY_ID': query_id})
        meta_copy.update({'next_page': meta['PAGE'] + 1})
        meta_copy.update({'PAGE': meta['PAGE'] + 1})
        meta_copy.update({'first_record': calculate_next_page(meta_copy['next_page'])})

        for data in datas:
            add_item = WosLiteAddItem()
            # prefer the explicitly specified year when storing the record
            to_db_year = meta.get("search_year")
            if not to_db_year:
                publish_year = data.get("Source", {}).get("Published.BiblioYear", [])
                if publish_year:
                    to_db_year = tools.str2int(publish_year[0])
            add_item["third_id"] = data.get('UT')
            add_item["year"] = to_db_year
            add_item["query_ids"] = [task_query_id]
            add_item["school_ids"] = [task_org_id]
            add_item["task_ids"] = [task_record_id]
            yield add_item

        yield scrapy.Request(
            url=config.WOS_LITE_QUERY_API + f'/{query_id}',
            body=model.lite_query_model(**meta_copy),
            meta=meta_copy,
            callback=self.parse_query_api,
        )

    async def parse_query_api(self, response, **kwargs):
        meta = response.meta
        task_query_id = self.query_id
        task_org_id = self.org_id
        task_record_id = self.record_id
        self.logger.debug("""
        %s
        %s""" % ('parse_query_api', meta))

        resp_result = response.json()
        query_id = meta.get('QUERY_ID')

        datas = resp_result.get('Data', [])
        if len(datas):
            for data in datas:
                add_item = WosLiteAddItem()
                # prefer the explicitly specified year when storing the record
                to_db_year = meta.get("search_year")
                if not to_db_year:
                    publish_year = data.get("Source", {}).get("Published.BiblioYear", [])
                    if publish_year:
                        to_db_year = tools.str2int(publish_year[0])
                add_item["third_id"] = data.get('UT')
                add_item["year"] = to_db_year
                add_item["query_ids"] = [task_query_id]
                add_item["school_ids"] = [task_org_id]
                add_item["task_ids"] = [task_record_id]
                yield add_item
        else:
            # build the SQL condition from the task record so the outcome can be recorded
            update_state_condition = ('batch_date = "%(batch_date)s"\n'
                                      'query_id = %(query_id)s\n'
                                      'task_condition = "%(task_condition)s"') % self.task_obj
            self.logger.warning('no more data\n%s' % update_state_condition)
            return
        if meta['first_record'] < meta['TOTAL']:
            meta_copy = deepcopy(meta)
            meta_copy.update({'next_page': meta['PAGE'] + 1})
            meta_copy.update({'PAGE': meta['PAGE'] + 1})
            meta_copy.update({'first_record': calculate_next_page(meta_copy['next_page'])})
            yield scrapy.Request(
                url=config.WOS_LITE_QUERY_API + f'/{query_id}',
                body=model.lite_query_model(**meta_copy),
                meta=meta_copy,
                callback=self.parse_query_api,
            )
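The spider imports science_article_add.configs.wos and science_article_add.models.wos_model, neither of which appears in this diff. A rough sketch of what they might contain, inferred from how the spider calls them and from the documented WOS Lite API parameters (databaseId, usrQuery, count, firstRecord); every value and function body below is an assumption, not the committed code. Note the spider passes the result of lite_query_model as the request body.

# science_article_add/configs/wos.py -- assumed contents
WOS_LITE_QUERY_FIRST_API = "https://wos-api.clarivate.com/api/woslite"      # initial search
WOS_LITE_QUERY_API = "https://wos-api.clarivate.com/api/woslite/query"      # paging by QueryID


# science_article_add/models/wos_model.py -- assumed contents
from urllib.parse import urlencode


def lite_base_model(usr_query: str, count: int = 100, first_record: int = 1) -> dict:
    """Parameters for the first WOS Lite search request."""
    return {
        "databaseId": "WOS",
        "usrQuery": usr_query,
        "count": count,
        "firstRecord": first_record,
    }


def lite_query_model(first_record: int = 1, count: int = 100, **_meta) -> str:
    """One page of an existing query; extra response.meta keys are ignored."""
    return urlencode({"firstRecord": first_record, "count": count})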
@@ -0,0 +1,8 @@
def str2int(val, replace=0):
    try:
        val = int(val)
    except ValueError:
        val = replace
    except TypeError:
        val = replace
    return val
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = science_article_add.settings

[deploy]
#url = http://localhost:6800/
project = science_article_add