science-spider2/science_article_wos/science_article_wos/models/wos_model.py

# -*- coding: utf-8 -*-
# @Time    : 2023/7/13 9:40
# @Author  : zhaoxiangpeng
# @File    : model.py

import json
import enum
import warnings
from typing import List, Tuple, Any, Dict, Union
from urllib.parse import urlencode

from science_article_wos.configs import wos as config

# JSON-style aliases for Python's False / True / None, so that values written
# in JSON notation (false / true / null) evaluate to the Python equivalents.
false = False
true = True
null = None


@enum.unique
class WosDB(enum.Enum):
    """Numeric ids of the supported databases; the member name is the database id string."""

    WOS = 1
    CSCD = 2


class AnalyzesEnum(enum.Enum):
    """
    Lists placed in the ``analyzes`` field of a search request's ``retrieve``
    section, keyed by product name.
    """

    WOSCC = [
        "TP.Value.6",
        "REVIEW.Value.6",
        "EARLY ACCESS.Value.6",
        "OA.Value.6",
        "DR.Value.6",
        "ECR.Value.6",
        "PY.Field_D.6",
        "DT.Value.6",
        "AU.Value.6",
        "DX2NG.Value.6",
        "PEERREVIEW.Value.6",
    ]
    CSCD = [
        "TP.Value.6",
        "DR.Value.6",
        "OA.Value.6",
        "PY.Field_D.6",
        "DT.Value.6",
        "SJ.Value.6",
        "AU.Value.6",
        "OG.Value.6",
        "SO.Value.6",
    ]


# Maps a caller-facing collection name to the product code used in request payloads.
ColNameMap = {
    'WOS': 'WOSCC',
    'CSCD': 'CSCD',
}


def calculate_next_page(next_page: int = 1, page_size: int = 100):
    """
    Compute the cursor of a page, i.e. the sequence number of its first record.
    :param next_page: page number of the next page
    :param page_size: number of records per page
    :return: sequence number of the first record on that page (page 1 -> 1)
    """
    records_before_page = page_size * (next_page - 1)
    return records_before_page + 1


def lite_base_model(usr_query: str, db_id: int = None, first_record: int = 1, page_size: int = 100, **kwargs):
    """
    Build the base query parameters as a dict.
    :param usr_query: query string, stored under ``usrQuery``
    :param db_id: numeric database id (a ``WosDB`` value); ``None`` means 1
    :param first_record: sequence number of the first record to return
    :param page_size: number of records per page, stored under ``count``
    :param kwargs: accepted for call-site compatibility, not used
    :return: dict with ``databaseId``, ``firstRecord``, ``count`` and ``usrQuery``
    :raises ValueError: if ``db_id`` is not a valid ``WosDB`` value
    """
    if db_id is None:
        db_id = 1
    # The message states a 1 ~ 100000 range, so check both bounds. This only
    # warns; the value is still passed through unchanged.
    if not 1 <= first_record <= 1e5:
        warnings.warn('first_record 必须在 1 ~ 100000 之间')
    return {
        'databaseId': WosDB(db_id).name,
        'firstRecord': first_record,
        'count': page_size,
        'usrQuery': usr_query,
    }


def lite_query_model(db_id: int = None, first_record: int = 1, page_size: int = 100, **kwargs):
    """
    Build the paging parameters as a URL-encoded query string.
    :param db_id: numeric database id (a ``WosDB`` value); ``None`` means 1
    :param first_record: sequence number of the first record to return
    :param page_size: number of records per page, sent as ``count``
    :param kwargs: accepted for call-site compatibility, not used
    :return: string of the form ``databaseId=...&firstRecord=...&count=...``
    """
    database = WosDB(1 if db_id is None else db_id)
    pairs = [
        ('databaseId', database.name),
        ('firstRecord', first_record),
        ('count', page_size),
    ]
    return urlencode(pairs)


def starter_documents_uid_get(uid, detail: str = None):
    """
    Build the query parameters for fetching a single document by uid.
    :param uid: document uid; not part of the query parameters
        (presumably it goes into the URL path -- confirm against the caller)
    :param detail: optional detail level; only added when not None
    :return: list of (name, value) query-parameter pairs
    """
    _query_params: List[Tuple[str, str]] = []
    if detail is not None:
        _query_params.append(("detail", detail))
    # The list was previously built but never returned.
    return _query_params


def starter_documents_get(q, db: Union[WosDB, str] = WosDB.WOS.name, limit: int = config.WOS_STARTER_PER_PAGE_LIMIT,
                          page: int = 1, sort_field: str = None,
                          modified_time_span=None, tc_modified_time_span=None, detail=None, **kwargs):
    """
    Build the query parameters for a document search.
    :param q: search query
    :param db: database to search, as a name string or a ``WosDB`` member
        (a member is sent by its name); omitted from the result when empty
    :param limit: records per page, at most 50
    :param page: page number; with limit 50 the range is 1~2000, i.e. at most 100,000 records
    :param sort_field: optional sort order, sent as ``sortField``
    :param modified_time_span: optional filter, sent as ``modifiedTimeSpan``
    :param tc_modified_time_span: optional filter, sent as ``tcModifiedTimeSpan``
    :param detail: full data by default; with ``short`` fewer fields are returned
        (uid, links{record,citingArticles,references,related}, citations[{db,count}], identifiers{doi,issn})
    :param kwargs: accepted for call-site compatibility, not used
    :return: list of (name, value) query-parameter pairs
    """
    if isinstance(db, WosDB):
        db = db.name

    _query_params: List[Tuple[str, Any]] = [("q", q)]
    if db:
        _query_params.append(("db", db))
    _query_params.append(("limit", limit))
    _query_params.append(("page", page))

    # Optional parameters are only sent when the caller supplied them.
    optional_params = (
        ("sortField", sort_field),
        ("modifiedTimeSpan", modified_time_span),
        ("tcModifiedTimeSpan", tc_modified_time_span),
        ("detail", detail),
    )
    for name, value in optional_params:
        if value is not None:
            _query_params.append((name, value))
    return _query_params


def make_advanced_search_ut(query: str = None, wos_ids: List = None, limit: int = 50, col_name: str = "WOS") -> Dict[
    str, Any]:
    """
    Build the search payload for a query string or a list of record ids.
    :param query: query text; when None it is generated from ``wos_ids``
    :param wos_ids: record ids, combined as ``UT=(id) OR UT=(id) ...``
    :param limit: number of records to retrieve
    :param col_name: key of ``ColNameMap`` selecting the product/database
    :return: request payload as a dict
    :raises ValueError: if ``query`` is None and ``wos_ids`` is None or empty
    :raises KeyError: if ``col_name`` is not a key of ``ColNameMap``
    """
    if query is None:
        # An empty id list would yield an empty query string, so reject it
        # together with None.
        if not wos_ids:
            raise ValueError('query 和 wos_ids 必须满足其中一个不为None')
        query = ' OR '.join(f'UT=({wos_id})' for wos_id in wos_ids)
    # Resolve the caller-facing name to the product code
    product = ColNameMap[col_name]
    # Copy the list so mutating the returned payload cannot alter the enum value.
    analyzes = list(getattr(AnalyzesEnum, product).value)
    model = {
        "product": product,
        "searchMode": "general",
        "viewType": "search",
        "serviceMode": "summary",
        "search": {
            "mode": "general",
            "database": product,
            "query": [
                {
                    "rowText": query
                }
            ],
            "sets": [],
            "options": {
                "lemmatize": "On"
            }
        },
        "retrieve": {
            "count": limit,
            "history": True,
            "jcr": True,
            "sort": "relevance",
            "analyzes": analyzes
        },
        "eventMode": None,
        "isPreprintReview": False
    }
    return model


def export_search_data_to_txt(
        q_id: str,
        mark_from: int = 1,
        mark_to: int = 500,
        col_name: str = "WOS",
        filters: str = config.DEFAULT_EXPORT_RECORD_FILTER
) -> Dict[str, Any]:
    """
    Build the payload for exporting the records of a search result.
    :param q_id: id of the search result obtained from a search
    :param mark_from: first record to export, inclusive
    :param mark_to: last record to export, inclusive; lowered so that at most 500 records are requested
    :param col_name: source database / collection
    :param filters: fullRecord (full record) / fullRecordPlus (full record and cited references)
    :return: request payload as a dict
    """
    max_records = 500
    # Both ends are inclusive, so the record count is mark_to - mark_from + 1.
    # The previous `> 500` check let a span of 501 records through.
    if mark_to - mark_from >= max_records:
        mark_to = mark_from + max_records - 1
    model = {"parentQid": q_id, "sortBy": "relevance",
             "displayTimesCited": "true", "displayCitedRefs": "true", "product": "UA", "colName": col_name,
             "displayUsageInfo": "true", "fileOpt": "othersoftware", "action": "saveToTab",
             "markFrom": str(mark_from), "markTo": str(mark_to),
             "view": "summary", "isRefQuery": "false", "locale": "zh_CN", "filters": filters}
    return model


def article_detail_model(uts: Union[List[str], str], core: str = "WOSCC"):
    """
    Build the payload for querying full record details by record id.
    Detail page: https://webofscience.clarivate.cn/wos/woscc/full-record/{wos_id}
    Endpoint: https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch
    :param uts: one record id or a list of record ids; a single string is wrapped in a list
    :param core: product/database code used for both ``product`` and ``search.database``
    :return: request payload as a dict
    """
    ut_list = [uts] if isinstance(uts, str) else uts

    # Settings for the related record lists requested alongside the main record
    secondary_retrieve = {
        "associated_data": {"sort": "relevance", "count": 10},
        "cited_references": {"sort": "author-ascending", "count": 30},
        "citing_article": {"sort": "date", "count": 2, "links": None, "view": "mini"},
        "cited_references_with_context": {"sort": "date", "count": 135, "view": "mini"},
        "recommendation_articles": {
            "sort": "recommendation-relevance", "count": 5, "links": None, "view": "mini",
        },
        "grants_to_wos_records": {
            "sort": "date-descending", "count": 30, "links": None, "view": "mini",
        },
    }
    retrieve = {
        "first": 1,
        "links": "retrieve",
        "sort": "relevance",
        "count": 1,
        "view": "super",
        "coll": None,
        "activity": False,
        "analyzes": None,
        "jcr": True,
        "reviews": True,
        "highlight": None,
        "secondaryRetrieve": secondary_retrieve,
    }
    search = {"database": core, "mode": "record_ids", "uts": ut_list}

    return {
        "eventMode": None,
        "isPreprintReview": False,
        "product": core,
        "retrieve": retrieve,
        "search": search,
        "searchMode": "record_ids",
        "viewType": "search",
        "serviceMode": "summary",
    }


# Model dedicated to citing-article queries
def get_wos_core_cites(
        uts_or_qid: str,
        year_range: tuple = None,
        core: str = "WOSCC",
        parent_db: str = "WOSCC",
        is_refine: bool = False
):
    """
    Build the payload for a citing-article search, or for refining one.
    https://webofscience.clarivate.cn/api/wosnx/core/runQuerySearch
    :param uts_or_qid: record id of the parent document, or the query id when refining
    :param year_range: years to filter on, ``(start, end)`` with both ends inclusive;
        a one-element tuple selects that single year. Giving it implies a refine request
    :param core: database to search
    :param parent_db: database of the parent document
    :param is_refine: whether this is a refine request
    :return: request payload as a dict. For a refine request ``search`` and
        ``isPreprintReview`` are dropped, ``qid`` is set and ``viewType`` is ``refine``
    """
    model = {
        "eventMode": None,
        "isPreprintReview": False,
        "product": core,
        "search": {"database": core, "mode": "citing_article", "parentDatabase": parent_db,
                   "parentDoc": None,
                   "parentId": {"type": "colluid", "value": uts_or_qid},
                   "parentQid": None, "parentSort": None},
        "searchMode": "citing_article",
        "serviceMode": "summary",
        "viewType": "search",
    }
    if year_range:
        is_refine = True
        # Only the first and last elements are used. range(*year_range) on a
        # one-element tuple would have produced every year from 0 upwards.
        first, last = year_range[0], year_range[-1]
        low, high = min(first, last), max(first, last)
        model["refines"] = [
            {"index": "PY", "value": [str(year) for year in range(low, high + 1)]}
        ]
    if is_refine:
        model.setdefault("qid", uts_or_qid)
        del model["search"]
        del model["isPreprintReview"]
        model["viewType"] = "refine"
    return model


def get_aggregation_wos_cited(q_id: str, core: str = "WOSCC"):
    """
    Build the payload for fetching the aggregation of citations per core collection.
    https://webofscience.clarivate.cn/api/wosnx/core/runQueryGetRecordsStream
    :param q_id: query id of the citing-article search
    :param core: product code placed in ``product``
    :return: request payload as a dict
    """
    return dict(
        product=core,
        qid=q_id,
        retrieve=dict(analyzes=["EDN.Value.200"]),
        searchMode="citing_article",
        viewType="records",
    )


def get_refine_count(q_id: str, count: int = 5):
    """
    Build the payload that refines a citing-article query to the
    WOS.SCI / WOS.SSCI / WOS.AHCI editions.
    :param q_id: query id of the search to refine
    :param count: currently unused; the payload has no ``retrieve`` section
    :return: request payload as a dict
    """
    edition_refine = {"index": "EDN", "value": ["WOS.SCI", "WOS.SSCI", "WOS.AHCI"]}
    payload = {"eventMode": None, "product": "WOSCC", "qid": q_id}
    payload["refines"] = [edition_refine]
    payload["searchMode"] = "citing_article"
    payload["serviceMode"] = "summary"
    payload["viewType"] = "refine"
    return payload


def get_record_info(body: bytes, sep: Union[str, bytes] = b'\n'):
    """
    Extract the query id and record count from a multi-line JSON response body.

    Each non-blank chunk of the body is parsed as JSON; the first object whose
    ``key`` is ``searchInfo`` supplies the result.
    :param body: response body, bytes or str
    :param sep: separator between JSON rows; converted to match the type of ``body``
    :return: ``(query_id, records_found)``; ``(None, 0)`` when no ``searchInfo`` row exists
    :raises json.JSONDecodeError: if a non-blank row is not valid JSON
    """
    # bytes.split() needs a bytes separator and str.split() a str one.
    if isinstance(body, str):
        if isinstance(sep, bytes):
            sep = sep.decode()
    elif isinstance(sep, str):
        sep = sep.encode()

    query_id = None
    records_found = 0
    for raw_row in body.strip().split(sep):
        raw_row = raw_row.strip()
        if not raw_row:
            # Blank lines (and an empty body) carry no JSON to parse.
            continue
        row = json.loads(raw_row)
        if not isinstance(row, dict) or row.get("key") != "searchInfo":
            continue
        # Tolerate a missing or null payload.
        payload = row.get("payload") or {}
        query_id = payload.get("QueryID")
        records_found = payload.get("RecordsFound")  # number of records found
        break  # stop at the first searchInfo row
    return query_id, records_found


if __name__ == '__main__':
    # Manual check when the module is run as a script.
    # NOTE(review): WosDB.WOS is passed as the first positional argument, which
    # is `usr_query` (the query string), not `db_id` -- confirm this is intended.
    m1 = lite_base_model(WosDB.WOS)