cssci spider

main
zhaoxiangpeng 1 week ago
parent 3e3d237388
commit 21963b7f80

@@ -0,0 +1,32 @@ science_article_cssci/configs/cssci.py
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 16:51
# @Author : zhaoxiangpeng
# @File : cssci.py
# Data source name
SOURCE_NAME = 'cssci'
# API configuration
CSSCI_CONTROL_API = 'http://cssci.nju.edu.cn/control/controllers.php'
# Search-list endpoint
CSSCI_SEARCH_API = CSSCI_CONTROL_API
# Article-detail endpoint
CSSCI_ARTICLE_DETAIL_API = CSSCI_CONTROL_API
# Request headers configuration
POST_HEADERS_CONFIG = {
'content-type': 'application/x-www-form-urlencoded',
'host': 'cssci.nju.edu.cn',
'origin': 'http://cssci.nju.edu.cn',
'referer': 'http://cssci.nju.edu.cn/index.html',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}
# Task conditions
TASK_CONDITION_CONFIG = dict(
task_table='task_search_strategy',
task_field=["id", "content", "param"],
task_condition="source_type = 8",
db_type="mysql",
batch_limit=1
)
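A minimal sketch (assuming the requests library; the article id is illustrative, and a valid session cookie from scripts/get_cookie.py is normally needed as well) of how these constants combine with the parameter helpers added in utils/model.py later in this commit:

import requests
from science_article_cssci.configs import cssci as config
from science_article_cssci.utils import model

# Quick manual check of the detail endpoint; the real crawl goes through Scrapy.
resp = requests.get(
    config.CSSCI_ARTICLE_DETAIL_API,
    params=model.get_article_detail_param(third_id='1004518477'),  # illustrative id
    headers=config.POST_HEADERS_CONFIG,
    timeout=30,
)
print(resp.json())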

@@ -0,0 +1,50 @@ science_article_cssci/db_utils/kafka.py
# -*- coding: utf-8 -*-
# @Time : 2026/1/16 14:11
# @Author : zhaoxiangpeng
# @File : kafka.py
import time
import logging
from aiokafka import AIOKafkaProducer
from kafka.errors import KafkaError
logger = logging.getLogger(__name__)
class KafkaUtil:
MAX_RETRIES = 3
BASE_DELAY = 2  # seconds: base for exponential backoff
def __init__(self, producer):
self._kafka_producer: AIOKafkaProducer = producer
self.logger = logging.getLogger(__name__)
async def producer_send(self, data, topic):
try:
future = await self._kafka_producer.send(topic, value=data)
future.add_callback(lambda metadata: self.on_send_success(metadata, data))
future.add_errback(self.on_send_error_factory(data, topic))
except KafkaError as e:
self.logger.error(f'{data.get("id", "")} - produce failed\n'
f'reason: {e}')
def on_send_success(self, record_metadata, data):
msg_id = data.get("id", "unknown_id")
self.logger.info(
f"{msg_id} - 成功发送到: "
f"topic={record_metadata.topic} - partition={record_metadata.partition} - offset={record_metadata.offset}")
def on_send_error_factory(self, data, topic, retries=0):
msg_id = data.get("id", "unknown_id")
def on_send_error(exc):
self.logger.info(f"{msg_id} - 第 {retries + 1} 次发送失败: {exc}")
if retries < self.MAX_RETRIES:
delay = self.BASE_DELAY * (2 ** retries)
self.logger.info(f"{msg_id} - {delay}s 后重试,第 {retries + 1}/{self.MAX_RETRIES}")
time.sleep(delay)
future = self._kafka_producer.send(topic, value=data)
future.add_callback(lambda metadata: self.on_send_success(metadata, data))
future.add_errback(self.on_send_error_factory(data, topic, retries + 1))
else:
self.logger.error(f"{msg_id} - 超过最大重试次数, 请检查")
return on_send_error
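The helper above attaches kafka-python style callbacks to the awaitable returned by aiokafka and blocks with time.sleep between retries; a minimal fully-async alternative sketch, assuming aiokafka's send_and_wait and asyncio-based backoff (not the project's code):

import asyncio
import logging
from aiokafka import AIOKafkaProducer
from aiokafka.errors import KafkaError

log = logging.getLogger(__name__)

async def send_with_retry(producer: AIOKafkaProducer, topic: str, data: dict,
                          max_retries: int = 3, base_delay: float = 2.0) -> bool:
    """Send one record, retrying with exponential backoff without blocking the event loop."""
    msg_id = data.get("id", "unknown_id")
    for attempt in range(max_retries + 1):
        try:
            md = await producer.send_and_wait(topic, value=data)
            log.info("%s - sent: topic=%s partition=%s offset=%s",
                     msg_id, md.topic, md.partition, md.offset)
            return True
        except KafkaError as exc:
            log.warning("%s - attempt %d failed: %s", msg_id, attempt + 1, exc)
            if attempt < max_retries:
                await asyncio.sleep(base_delay * (2 ** attempt))
    log.error("%s - exceeded max retries", msg_id)
    return False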

@@ -0,0 +1,92 @@ science_article_cssci/db_utils/mongo.py
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, Dict, Tuple
from pymongo import MongoClient
from pymongo import UpdateOne
from pymongo.errors import DuplicateKeyError, BulkWriteError
if TYPE_CHECKING:
from pymongo.database import Database
from pymongo.collection import Collection
from pymongo.results import InsertOneResult, InsertManyResult, BulkWriteResult
def build_update_query(update_data: dict, replace: bool = True) -> dict:
"""
If replace is True, existing document fields are simply overwritten ($set); otherwise list values are appended via $addToSet.
"""
update_query = {}
if not update_data:
return {}
for key, val in update_data.items():
if replace:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
else:
if isinstance(val, list):
update_query.setdefault(
"$addToSet", {}
).update({
key: {"$each": val}
})
else:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
return update_query
def update_document(filter_query: dict = None, update_data: dict = None, replace: bool = True) -> Tuple[dict, dict]:
update_query = {}
if not update_data:
return {}, {}
for key, val in update_data.items():
if replace:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
else:
if isinstance(val, list):
update_query.setdefault(
"$addToSet", {}
).update({
key: {"$each": val}
})
else:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
return filter_query, update_query
class MongoDBUtils:
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
self.client: MongoClient = None
self.db: Database = None
def insert2db(self, items, tablename, **kwargs) -> InsertOneResult:
collection: Collection = self.db.get_collection(tablename)
result: InsertOneResult = collection.insert_one(items, **kwargs)
return result
def _insert2db(self, items, tablename, ordered: bool = False, **kwargs) -> InsertManyResult:
collection: Collection = self.db.get_collection(tablename)
result: InsertManyResult = collection.insert_many(items, ordered=ordered, **kwargs)
return result
def _update2db(self, items, tablename, ordered: bool = False, **kwargs) -> BulkWriteResult:
collection: Collection = self.db.get_collection(tablename)
bulk_results: BulkWriteResult = collection.bulk_write(items, ordered=ordered, **kwargs)
return bulk_results
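A brief usage sketch (the URI, id and field values are placeholders) combining build_update_query with the bulk-write helper above:

from pymongo import MongoClient, UpdateOne
from science_article_cssci.db_utils.mongo import MongoDBUtils, build_update_query

utils = MongoDBUtils(mongo_uri="mongodb://localhost:27017", mongo_db="science2")
utils.client = MongoClient(utils.mongo_uri)
utils.db = utils.client[utils.mongo_db]

# replace=False appends list values ($addToSet) and $sets the rest.
update = build_update_query({"task_ids": [42], "exported": 0}, replace=False)
ops = [UpdateOne({"third_id": "1004518477"}, update, upsert=True)]
result = utils._update2db(ops, tablename="data_cssci_article")
print(result.upserted_count, result.modified_count)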

@@ -0,0 +1,34 @@ science_article_cssci/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScienceArticleCssciItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class AddItemBase(scrapy.Item):
third_id = scrapy.Field()
updated_at = scrapy.Field()
class ArticleItem(AddItemBase):
exported = scrapy.Field()
class ArticleCitedItem(AddItemBase):
cited = scrapy.Field()
class CssciArticleItem(ArticleItem):
__tablename__ = 'data_cssci_article'
third_id = scrapy.Field()
resp_raw = scrapy.Field()
detailed = scrapy.Field()
updated_at = scrapy.Field()

@@ -0,0 +1,143 @@ science_article_cssci/middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
from typing import Optional
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.http.headers import Headers
from science_article_cssci.scripts.get_cookie import GetSessionID
logger = logging.getLogger(__name__)
class ScienceArticleCssciSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
# matching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScienceArticleCssciDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CssciCookieMiddleware:
ss: Optional[GetSessionID]
def __init__(self, custom_headers: dict, cookie_cfg: dict):
self.custom_headers = custom_headers
self.headers = Headers(self.custom_headers)
self.cookies_pool_config = cookie_cfg
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
post_headers = crawler.settings.getdict('POST_HEADERS_CONFIG')
s = cls(
custom_headers=post_headers,
cookie_cfg=dict(
redis_uri=settings.get("COOKIE_POOL_CONFIG"),
pool_key=settings.get("COOKIE_POOL_REDIS_KEY"),
ttl=settings.get("COOKIE_REDIS_TTL")
)
)
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def spider_opened(self, spider):
self.ss = GetSessionID(**self.cookies_pool_config)
async def process_request(self, request, spider):
cookie_1 = await self.ss.get_cookie_from_redis()
if not cookie_1:
cookie_1 = await self.ss.get_cookie_to_redis()
logger.info("""
没有可用cookie
重新获取: %s""" % cookie_1)
request.cookies = cookie_1
request.headers = self.headers
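The cookie middleware is enabled per-spider via custom_settings further down in this commit; the equivalent project-wide setting would be:

DOWNLOADER_MIDDLEWARES = {
    "science_article_cssci.middlewares.CssciCookieMiddleware": 540,
}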

@@ -0,0 +1,275 @@ science_article_cssci/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from __future__ import annotations
import re
import json
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Tuple, Union, Optional
import scrapy
from itemadapter import ItemAdapter
from kafka import KafkaProducer
from pymongo import MongoClient
from pymongo.errors import (
DuplicateKeyError,
BulkWriteError
)
from science_article_cssci.db_utils.mongo import MongoDBUtils, build_update_query
from science_article_cssci.db_utils.kafka import KafkaUtil
from science_article_cssci.scripts.field_assembly import FieldAssembly
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.statscollectors import StatsCollector
from pymongo.collection import Collection
mongo_logger = logging.getLogger('pymongo')
mongo_logger.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
class ScienceArticleCssciPipeline:
def process_item(self, item, spider):
return item
class MongoPipeline(MongoDBUtils):
def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
super().__init__(mongo_uri, mongo_db)
self.stats: StatsCollector = stats
self.insert_failure_update_enable = True
self.duplicate_cover_enable = False  # overwrite duplicate documents with the new data
@classmethod
def from_crawler(cls, crawler: Crawler):
m = cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
stats=crawler.stats
)
return m
def open_spider(self, spider):
self.client = MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def process_item(self, item, spider) -> scrapy.Item:
"""
Insert only; duplicate-key errors are counted in stats and otherwise ignored.
"""
adapter = ItemAdapter(item)
tablename = self._get_item_table(item)
collection = self.db.get_collection(tablename)
d = adapter.asdict()
try:
collection.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(tablename))
except DuplicateKeyError as duplicate_error:
self.stats.inc_value("item2db_duplicate/{}".format(tablename))
self.stats.inc_value(f"item_dropped_reasons_count/duplicate")
except Exception:
raise
return item
def process_item_update(self, item, spider) -> scrapy.Item:
"""
Insert; on a duplicate-key error, fall back to updating the existing document.
"""
adapter = ItemAdapter(item)
tablename = self._get_item_table(item)
collection = self.db.get_collection(tablename)
d = adapter.asdict()
try:
collection.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(tablename))
except DuplicateKeyError as duplicate_error:
if self.insert_failure_update_enable:
write_error = duplicate_error.details
filter_query, update_query = self._pick_filter_update(write_error, doc=d)
updated_at_query = None  # strip the volatile updated_at so it cannot skew the update; updates touching only non-task_id fields do not need this
key_pattern = write_error.get('keyPattern')
key_value = write_error.get('keyValue')
logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
# dedicated handling for incremental tasks
task_ids = update_query.pop("task_ids", None)
if task_ids:
# task_ids always triggers a change, so handle it first
task_id_query = {'task_ids': task_ids}
collection.update_one(filter=filter_query, update=build_update_query(task_id_query, replace=False))
updated_at_query = {"updated_at": update_query.pop('updated_at', None)}
update_q = build_update_query(update_query, replace=self.duplicate_cover_enable)
up_result = collection.update_one(filter=key_value, update=update_q, upsert=True)
if up_result.matched_count == up_result.modified_count == 1:
# refresh the update timestamp on change (it is written even when nothing changed, so we can tell when this record was last touched)
if updated_at_query:
collection.update_one(filter=key_value, update={"$set": updated_at_query})
self.stats.inc_value("item2db_updated/{}".format(tablename))
except Exception:
raise
return item
@staticmethod
def _pick_filter_update(write_error, doc: dict = None):
original_doc = write_error.get('op', doc)  # the document that was being inserted
key_pattern = write_error.get('keyPattern')
original_doc.pop("_id", None)  # drop the _id generated by the failed insert
filter_query = {}
update_query = {key: val for key, val in original_doc.items() if val}
for key in key_pattern.keys():
filter_query.update({key: update_query.pop(key, None)})
return filter_query, update_query
def close_spider(self, spider):
self.client.close()
@staticmethod
def _get_item_table(item) -> str:
"""获取Item类型"""
if hasattr(item, '__tablename__'):
return item.__class__.__tablename__
return 'items_null_table'
class Mongo2SciencePipeline(MongoPipeline):
def process_item(self, item, spider):
d = self.parse2science(item)
table = self._get_item_table(item)
coll = self.db.get_collection(table)
return d
@staticmethod
def parse2science(item) -> dict:
def change_qi(tmp_str):
return re.sub(r'^0|0$', '', tmp_str)
def change_string(tmp_str):
tmp = tmp_str.split("aaa")
tmp_z = []
for t in tmp:
if len(t) > 1:
tmp_z.append(t)
return tmp_z
adapter = ItemAdapter(item)
third_id = adapter['third_id']
resp_raw = adapter['resp_raw']
resp_raw = json.loads(resp_raw)
authors: list = resp_raw['author']
authors = [dict(zzmc=au_info['zzmc'], jgmc=au_info['jgmc'], bmmc=au_info['bmmc']) for au_info in
authors or []]  # author / institution / department
# pprint(authors)
catations: list = resp_raw.get('catation')
contents: list = resp_raw.get('contents', [])
body = [c for c in contents if c.get("sno") == adapter['third_id']]
if body:
content = body[0]
else:
content = {}
d = dict(
sno=third_id,
lypm=content['lypm'],  # title
blpm=content['blpm'],  # English title
zzjg=authors,  # authors and affiliations
wzlx=content['wzlx'],  # document type
xkfl1=content['xkfl1'],  # subject category 1
xkfl2=content.get('xkfl2', ''),  # subject category 2
xkdm1=content['xkdm1'],  # CLC class number 1
xkdm2=content.get('xkdm2', ''),  # CLC class number 2
xmlb=content.get('xmlb', ''),  # funding project
qkmc=content.get('qkmc', ''),  # source journal
# year (nian), volume (juan), issue (qi).replace(/^0|0$/g,''), pages (ym)
nian=content.get('nian', ''),  # year
juan=content.get('juan', ''),  # volume
qi=content.get('qi', ''),  # issue
ym=content.get('ym', ''),  # pages
byc=change_string(content.get('byc', '')),  # keywords
ckwx=catations  # references
)
return d
class BuildDetailPipeline:
def process_item(self, item, spider):
adapter = ItemAdapter(item)
item['detailed'] = self.build_detailed(adapter)
return item
@staticmethod
def build_detailed(item):
resp_raw = item.get("resp_raw")
dd = dict(
**FieldAssembly.parse_detail(
content=resp_raw.get('content'),
author=resp_raw.get('author'),
catation=resp_raw.get('catation'),
)
)
return dd
class KafkaPipeline:
def __init__(self, kafka_servers, topic):
self.kafka_servers = kafka_servers
self.topic = topic
self.producer: KafkaProducer = None
@classmethod
def from_crawler(cls, crawler):
return cls(
kafka_servers=crawler.settings.get('KAFKA_SERVERS', 'localhost:9092'),
topic=crawler.settings.get('KAFKA_TOPIC', 'scrapy_items')
)
def open_spider(self, spider):
self.producer: KafkaProducer = KafkaProducer(
bootstrap_servers=self.kafka_servers,
value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
acks='all',
linger_ms=50,  # batching window: wait at most 50 ms before sending
compression_type='gzip',
)
spider.logger.info(f"Connected to Kafka at {self.kafka_servers}")
def close_spider(self, spider):
if self.producer:
self.producer.flush()
self.producer.close()
spider.logger.info("Kafka connection closed")
def process_item(self, item, spider):
adapter = ItemAdapter(item)
d = adapter.asdict()
d = self.build2kafka(d)
# send to Kafka
future = self.producer.send(
topic=self.topic,
value=d,
headers=[('source_type', b'cssci')]  # kafka-python expects (str, bytes) header tuples
)
future.add_callback(self.on_send_success)
future.add_errback(self.on_send_error)
return item
def on_send_success(self, record_metadata):
"""发送成功回调"""
pass
def on_send_error(self, excp):
"""发送失败回调"""
pass
def build2kafka(self, item: dict) -> dict:
dd = dict(
id=item.get("third_id"),
**item.get('detailed')
)
return dd
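The duplicate-key handling in MongoPipeline only fires if the target collection has a unique index; a minimal sketch of creating one, assuming third_id is the dedup key (the commit itself does not define the index):

from pymongo import MongoClient, ASCENDING

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
db = client["science2"]
# Assumed unique key; without such an index insert_one never raises DuplicateKeyError.
db["data_cssci_article"].create_index([("third_id", ASCENDING)], unique=True)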

@@ -0,0 +1,62 @@ config.ini
[source_type]
1 = 论文
2 = 综述
3 = 评论
5 = 报告
4 = 传记资料
9 = 其他
[source_jj]
1 = 国家自科基金
2 = 国家社科基金
3 = 国家级其它基金
4 = 教育部基金
5 = 其他部委级基金
6 = 中科院基金
7 = 社科院基金
8 = 省(市)级基金
9 = 其它基金
[yw_type]
1 = 期刊论文
11 = 电子文献
10 = 法规
9 = 标准
8 = 报告
7 = 汇编
6 = 信件
5 = 学位论文
4 = 会议文献
3 = 报纸
2 = 图书
99 = 其他
[source_xk]
630 = 管理学
850 = 民族学
860 = 新闻学与传播学
870 = 图书馆、情报与文献学
880 = 教育学
890 = 体育学
910 = 统计学
920 = 心理学
930 = 社会科学总论
940 = 军事学
950 = 文化学
960 = 人文、经济地理
970 = 环境科学
840 = 社会学
820 = 法学
710 = 马克思主义
720 = 哲学
730 = 宗教学
740 = 语言学
009 = 文学
751 = 外国文学
752 = 中国文学
760 = 艺术学
770 = 历史学
780 = 考古学
790 = 经济学
810 = 政治学
999 = 其他学科

@@ -0,0 +1,170 @@ data_export.py
# -*- coding: utf-8 -*-
# @Time : 2022/1/12 8:36
# @Author : ZhaoXiangPeng
# @File : data_export.py
import os
import re
import time
from typing import List
from configparser import ConfigParser, NoSectionError
cfg_parse = ConfigParser()
try:
cfg_parse.read(os.path.join(os.path.dirname(__file__), 'config.ini'), encoding='utf-8')
print(cfg_parse)
except NoSectionError:
pass
else:
source_type = cfg_parse['source_type']
source_jj = cfg_parse['source_jj']
yw_type = cfg_parse['yw_type']
source_xk = cfg_parse['source_xk']
def ckwx(wxs: list):
wx_list = []
if wxs is None:
return wx_list
for i in range(wxs.__len__()):
content = ''
content += str(i+1) + ' '
r1: dict = wxs[i]
if r1['ywlx'] == '1':  # journal article
tmp = r1['ywcc'].replace(r1['ywnd']+'', '').replace(r1['ywnd']+'.', '').replace(r1['ywnd']+'','').replace(r1['ywnd']+',','')
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
content += f',{tmp}'
elif r1['ywlx'] == '3':  # newspaper
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywcc"]}' if len(r1['ywcc']) else f'.{r1["ywnd"]}'
elif r1['ywlx'] == '11':  # electronic resource (website)
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
elif r1['ywlx'] == '99':  # other
content += r1['ywpm']
elif r1['ywlx'] == '7':
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
if len(r1["ywcc"]):
content += '.'+r1["ywcc"]
else:
content += f':{r1["ywqk"]}' if r1['ywqk'].__len__() else ''
content += f'.{r1["ywcbd"]}' if r1['ywcbd'].__len__() else ''
content += f':{r1["ywcbs"]}' if r1['ywcbs'].__len__() else ''
content += f',{r1["ywqk"]}' if r1['ywqk'].__len__() > 2 else ''
content += f':{r1["ywym"]}' if r1['ywym'].__len__() else ''
else:
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
content += f'.{r1["ywcbd"]}' if len(r1["ywcbd"]) else ''
content += f':{r1["ywcbs"]}' if len(r1["ywcbs"]) else ''
content += f',{r1["ywnd"]}' if len(r1["ywnd"]) > 2 else ''
content += f':{r1["ywym"]}' if len(r1["ywym"]) else ''
wx_list.append(content)
return wx_list
def process_ref_field(row: dict, to_qikan=False) -> List[dict]:
row_list = []
ckwx_list = ckwx(row.get('ckwx', []))
for ckwx_one in ckwx_list:
new_row = {}
new_row['rid'] = None
new_row['reference'] = ckwx_one
new_row['aid'] = row.get('sno')
new_row['url'] = 'http://qikan.cqvip.com/Qikan/Article/Detail?id='+row.get('sno')
row_list.append(new_row)
return row_list
def process_field(row: dict, to_qikan=False):
def jtrim(x):
return re.sub(r'^(\s|\u00A0)+|(\s|\u00A0)+$', '', x or '')
def zzjg(jgs: list):
zzjg_list = []
for i in range(jgs.__len__()):
r1: dict = jgs[i]
vals: list = [str(i+1)] + list(r1.values())
zzjg_list.append('.'.join(vals))
return zzjg_list
def vpzz(jgs: list):
zz_list = []
for i in range(jgs.__len__()):
r1: dict = jgs[i]
zz_list.append(r1['zzmc']+f'[{i+1}]')
return zz_list
def vpjg(jgs: list):
jg_list = []
for i in range(jgs.__len__()):
r1: dict = jgs[i]
jg_list.append(f'[{i+1}]'+r1['jgmc']+r1['bmmc'])
return jg_list
def wxlx(t: str):
return source_type.get(t)
def xkfl(t: str):
return source_xk.get(t)
def change_qi(tmp_str):
return re.sub(r'^0|0$', '', tmp_str)
def ndjq(n, j, q, ym):
val = ''
if len(n):
val += f'{n}'
if len(j):
val += f'{j}'
if len(q):
q = change_qi(q)
val += f'{q}'
if len(ym):
val += f'{ym}'
return val
new_row = {}
if to_qikan is False:
new_row['篇名'] = row.get('lypm')
new_row['英文篇名'] = row.get('blpm')
new_row['作者及机构'] = '; '.join(zzjg(row.get('zzjg', [])))
new_row['文献类型'] = wxlx(row.get('wzlx', ''))
fl1 = xkfl(row.get('xkfl1', ''))
fl2 = xkfl(row.get('xkfl2', ''))
new_row['学科类别'] = (fl1 if fl1 is not None else '') + ('/' + fl2 if fl2 is not None else '')
lh1 = row.get('xkdm1', '')
lh2 = row.get('xkdm2', '')
new_row['中图类号'] = (lh1 if len(lh1) else '') + ('/' + lh2 if len(lh2) else '')
new_row['基金项目'] = row.get('xmlb')
new_row['来源期刊'] = row.get('qkmc')
new_row['年代卷期'] = ndjq(row.get('nian'), row.get('juan'), row.get('qi'), row.get('ym'))
new_row['关键词'] = '/'.join(row.get('byc', []))
new_row['参考文献'] = '; '.join(ckwx(row.get('ckwx', [])))
else:
new_row['序号'] = 1
new_row['题名'] = row.get('lypm')
jgzz = row.get('zzjg', [])
new_row['作者'] = ';'.join(vpzz(jgzz))
new_row['机构'] = ';'.join(vpjg(jgzz))
new_row['基金'] = row.get('xmlb')
new_row['刊名'] = row.get('qkmc')
new_row['年'] = row.get('nian')
new_row['卷'] = row.get('juan')
new_row['期'] = change_qi(row.get('qi'))
new_row['ISSN号'] = None
new_row['CN号'] = None
new_row['页码'] = str(row.get('ym'))
new_row['关键词'] = ';'.join(row.get('byc', []))
lh1 = row.get('xkdm1', '')
lh2 = row.get('xkdm2', '')
new_row['分类号'] = (lh1 if len(lh1) else '') + (';' + lh2 if len(lh2) else '')
new_row['文摘'] = None
new_row['网址'] = 'http://qikan.cqvip.com/Qikan/Article/Detail?id='+row.get('sno')
new_row['aid'] = row.get('sno')
new_row['FileName'] = None
new_row['引文'] = '; '.join(ckwx(row.get('ckwx', [])))
new_row['被引量'] = None
return new_row
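A short usage sketch with a synthetic row (all values are illustrative) showing the two export layouts produced by process_field:

row = {
    'sno': '1004518477',
    'lypm': '示例篇名', 'blpm': 'Example Title',
    'zzjg': [{'zzmc': '张三', 'jgmc': '南京大学', 'bmmc': '信息管理学院'}],
    'wzlx': '1', 'xkfl1': '870', 'xkdm1': 'G25', 'xmlb': '',
    'qkmc': '中国图书馆学报', 'nian': '2023', 'juan': '49', 'qi': '01', 'ym': '4-15',
    'byc': ['示例', '关键词'], 'ckwx': [],
}
print(process_field(row))                  # CSSCI-style columns (篇名, 作者及机构, ...)
print(process_field(row, to_qikan=True))   # CQVIP-style columns (题名, 刊名, ...)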

@@ -0,0 +1,140 @@ science_article_cssci/scripts/field_assembly.py
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 17:22
# @Author : zhaoxiangpeng
# @File : field_assembly.py
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import List
def strip_symbol(text, symbol='aaa') -> str:
return text.strip(symbol)
def join_list(texts: list, symbol='/'):
return symbol.join(texts)
def spilt_text(text, symbol='aaa') -> List[str]:
r = []
if not text:
return r
text = strip_symbol(text)
texts = text.split(symbol)
for t in texts:
t = strip_symbol(t)
if t:
r.append(t)
return r
def author_org(datas: list[dict]) -> list:
r = []
for data in datas:
sno = data.get('sno')
au_name = data.get('zzmc')
org_name = data.get('jgmc')
o = [org_name]
xueyuan = data.get('bmmc')
if xueyuan:
o.append(xueyuan)
r.append(f'[{au_name}]'+join_list(o, '.'))
return r
def org(datas: list[dict]) -> list:
r = []
for data in datas:
org_name = data.get('jgmc')
r.append(org_name)
return r
class ArticleAbs:
def to_dict(self):
pass
class FieldAssembly:
@staticmethod
def parse_author():
return
@staticmethod
def parse_reference(refs: List[dict]):
ref_list = []
for i, r1 in enumerate(refs):
content = ''
content += str(i + 1) + '.'
if r1['ywlx'] == '1':  # journal article
tmp = r1['ywcc'].replace(r1['ywnd'] + '', '').replace(r1['ywnd'] + '.', '').replace(r1['ywnd'] + '',
'').replace(
r1['ywnd'] + ',', '')
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
content += f',{tmp}'
elif r1['ywlx'] == '3':  # newspaper
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywcc"]}' if len(r1['ywcc']) else f'.{r1["ywnd"]}'
elif r1['ywlx'] == '11':  # electronic resource (website)
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
elif r1['ywlx'] == '99':  # other
content += r1['ywpm']
elif r1['ywlx'] == '7':
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
if len(r1["ywcc"]):
content += '.' + r1["ywcc"]
else:
content += f':{r1["ywqk"]}' if r1['ywqk'].__len__() else ''
content += f'.{r1["ywcbd"]}' if r1['ywcbd'].__len__() else ''
content += f':{r1["ywcbs"]}' if r1['ywcbs'].__len__() else ''
content += f',{r1["ywqk"]}' if r1['ywqk'].__len__() > 2 else ''
content += f':{r1["ywym"]}' if r1['ywym'].__len__() else ''
else:
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
content += f'.{r1["ywcbd"]}' if len(r1["ywcbd"]) else ''
content += f':{r1["ywcbs"]}' if len(r1["ywcbs"]) else ''
content += f',{r1["ywnd"]}' if len(r1["ywnd"]) > 2 else ''
content += f':{r1["ywym"]}' if len(r1["ywym"]) else ''
ref_list.append(content)
return ref_list
@staticmethod
def parse_detail(content: dict, author: List[dict], catation: List[dict]) -> dict:
d = dict()
d['a_id'] = content.get('id')
d['title'] = content.get('lypm')
d['title_en'] = content.get('blpm')
author_list = spilt_text(content.get('authors', ''), symbol='aaa')
d['author'] = join_list(author_list, symbol='/')
d['fund'] = content.get('xmlb')
d['source'] = content.get('qkmc')
org_list = org(author)
d['first_org'] = org_list and org_list[0]
d['org'] = join_list(author_org(author), symbol='/')
d['first_author'] = author_list and author_list[0]
d['classification_code'] = content.get('xkdm1')
year = content.get('nian')
d['year'] = year
d['volume'] = content.get('juan')
d['issue'] = content.get('qi')
d['page'] = content.get('ym')
keyword_list = spilt_text(content.get('byc'), symbol=';')
d['keywords'] = join_list(keyword_list, symbol='/')
d['fund'] = content.get('xmlb')
d['fund_type'] = content.get('jjlb')
d['references'] = FieldAssembly.parse_reference(catation)
return d
def parsing(self):
pass

@@ -0,0 +1,106 @@ science_article_cssci/scripts/get_cookie.py
# -*- coding: utf-8 -*-
# @Time : 2024/11/12 16:10
# @Author : zhaoxiangpeng
# @File : get_cookie.py
# Used to obtain the session_id cookie
import asyncio
import json
from random import random
import redis
import redis.asyncio  # make sure the asyncio submodule is loaded for redis.asyncio.Redis below
import aiohttp
class GetSessionID:
__redis_cli = None
def __init__(self, redis_uri: str, pool_key: str, ttl: int = None, **kwargs):
self.redis_uri = redis_uri
self.pool_key = pool_key
self.ttl = ttl
@staticmethod
async def new_session_id() -> dict:
session = aiohttp.ClientSession()
resp = await session.get(
'http://cssci.nju.edu.cn/index.html',
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'host': 'cssci.nju.edu.cn'
}
)
assert resp.status == 200
resp = await session.post(
'http://cssci.nju.edu.cn/control/controllers.php',
headers={
'content-type': 'application/x-www-form-urlencoded',
'host': 'cssci.nju.edu.cn',
'origin': 'http://cssci.nju.edu.cn',
'referer': 'http://cssci.nju.edu.cn/index.html',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
},
data=dict(control='user_control', action='check_user_online', rand=random()),
)
assert resp.status == 200
# convert the cookie jar to a dict and return it
cookie_obj = dict()
cookie_jar = session.cookie_jar.filter_cookies(resp.url)
for key, jar in cookie_jar.items():
cookie_obj.setdefault(jar.key, jar.value)
# close the session
await session.close()
return cookie_obj
@property
def redis_cli(self):
if self.__class__.__redis_cli is None:
self.__class__.__redis_cli = redis.asyncio.Redis.from_url(
self.redis_uri,
decode_responses=True
)
return self.__class__.__redis_cli
async def set_cookie_to_redis(self, val):
result = await self.redis_cli.setex(self.pool_key, time=self.ttl, value=val)
return result
async def get_cookie_from_redis(self, to_dict=True):
"""
:param to_dict: whether to convert the stored string back into a dict
:return:
"""
cookie_str = await self.redis_cli.get(self.pool_key)
if not cookie_str:
return cookie_str
if to_dict:
return json.loads(cookie_str)
return cookie_str
async def get_cookie_to_redis(self) -> dict:
"""
Fetch a fresh cookie and push it straight into Redis.
:return:
"""
cookie_obj = await self.new_session_id()
await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
return cookie_obj
async def test(self):
from loguru import logger
cookie_obj = await self.new_session_id()
logger.info("""
cookie: %s""" % cookie_obj)
res = await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
logger.info("""
插入: %s""" % res)
res = await self.get_cookie_from_redis()
logger.info("""
获取: %s""" % res)
def main(self):
asyncio.run(self.test())
if __name__ == '__main__':
# redis_uri / pool_key / ttl mirror REDIS_URL, COOKIE_POOL_REDIS_KEY and COOKIE_REDIS_TTL in settings.py
GetSessionID(redis_uri='redis://:kcidea1509@192.168.1.211:6379/10',
pool_key='cookies_pool:cssci:session', ttl=60 * 60 * 6).main()

@@ -0,0 +1,110 @@ science_article_cssci/settings.py
# Scrapy settings for science_article_cssci project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "science_article_cssci"
SPIDER_MODULES = ["science_article_cssci.spiders"]
NEWSPIDER_MODULE = "science_article_cssci.spiders"
ADDONS = {}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
POST_HEADERS_CONFIG = {
# 'content-type': 'application/x-www-form-urlencoded',
'host': 'cssci.nju.edu.cn',
'origin': 'http://cssci.nju.edu.cn',
'referer': 'http://cssci.nju.edu.cn/index.html',
'user-agent': USER_AGENT
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "science_article_cssci.middlewares.ScienceArticleCssciSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "science_article_cssci.middlewares.ScienceArticleCssciDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "science_article_cssci.pipelines.ScienceArticleCssciPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'
MONGO_URI_SCIENCE = "mongodb://root:kcidea1509!%25)(@43.140.203.187:27017/"
MONGO_DATABASE_SCIENCE = 'science'
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
# Cookie pool Redis configuration
COOKIE_POOL_CONFIG = REDIS_URL
COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session'
COOKIE_REDIS_TTL = 60 * 60 * 6
KAFKA_SERVERS = ['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092']
KAFKA_TOPIC = "test2kafka"

@@ -0,0 +1,4 @@ science_article_cssci/spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,63 @@
import re
import json
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str
from science_article_cssci.utils import model
from science_article_cssci.items import CssciArticleItem
from science_article_cssci.configs import cssci as config
class CssciArticleByIdSpider(RedisSpider):
name = "cssci_article_by_id"
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_cssci.middlewares.CssciCookieMiddleware": 540,
},
ITEM_PIPELINES={
"science_article_cssci.pipelines.BuildDetailPipeline": 300,
"science_article_cssci.pipelines.MongoPipeline": 310,
"science_article_cssci.pipelines.KafkaPipeline": 350,
},
# LOG_LEVEL="INFO"
)
def make_request_from_data(self, data):
data = bytes_to_str(data)
data = json.loads(data)
third_id = data.get("third_id")
yield scrapy.FormRequest(
url=config.CSSCI_ARTICLE_DETAIL_API, method="GET",
formdata=model.get_article_detail_param(third_id=third_id), callback=self.parse_detail,
meta={"third_id": third_id})
def parse_detail(self, response, **kwargs):
def change_qi(tmp_str):
return re.sub(r'^0|0$', '', tmp_str)
def change_string(tmp_str):
tmp = tmp_str.split("aaa")
tmp_z = []
for t in tmp:
if len(t) > 1:
tmp_z.append(t)
return tmp_z
# print(response)
meta = response.meta
third_id = meta['third_id']
resp_json = response.json()
contents: list = resp_json.get('contents', [])
body = [c for c in contents if c.get("sno") == third_id]
if body:
content = body[0]
else:
content = {}
d = dict(
content=content,
author=resp_json.get("author"),
catation=resp_json.get("catation"),
)
self.logger.debug(d)
article_item = CssciArticleItem(**dict(third_id=third_id, resp_raw=d))
yield article_item
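Because this is a scrapy_redis RedisSpider, crawls are seeded by pushing JSON tasks onto the spider's Redis start key; a minimal sketch (the key assumes scrapy_redis's default '<spider name>:start_urls' and the third_id is illustrative):

import json
import redis

r = redis.Redis.from_url('redis://:kcidea1509@192.168.1.211:6379/10')  # REDIS_URL from settings.py
r.lpush('cssci_article_by_id:start_urls', json.dumps({"third_id": "1004518477"}))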

@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/13 11:19
# @Author : zhaoxiangpeng
# @File : __init__.py

@@ -0,0 +1,53 @@ science_article_cssci/utils/model.py
# -*- coding: utf-8 -*-
# @Time : 2024/11/13 14:47
# @Author : zhaoxiangpeng
# @File : model.py
import math
from random import random
from datetime import datetime
from urllib.parse import quote
CURRENT_YEAR = datetime.today().year
def new_session_key() -> int:
"""
Originally assumed to be issued by the server; it is actually regenerated every time
a search reloads the HTML, and nothing appears to validate it:
var session_key=Math.floor(Math.random()*1000)
:return:
"""
return math.floor(random() * 1000)
def get_search_list_param(
query: str,
page: int = 1,
year: int = CURRENT_YEAR,
session_key: int = None,
**kwargs
):
"""
:param query:
:param page:
:param year:
:param session_key: generated in the HTML returned for each search; it does not change while paging
:param kwargs:
:return:
"""
param = {'control': 'search_base', 'action': 'search_lysy',
'title': quote(query, safe='/', encoding='utf-8'),
'xkfl1': '',
'wzlx': '', 'qkname': '', 'type': '', 'jj': '', 'start_year': 1998, 'end_year': CURRENT_YEAR, 'nian': year,
'juan': '',
'qi': '', 'xw1': '', 'xw2': '', 'pagesize': 20, 'pagenow': page, 'order_type': 'nian', 'order_px': 'ASC',
'search_tag': 1, 'session_key': session_key, 'rand': random().__str__()}
return param
def get_article_detail_param(third_id=None):
return dict(control='search',
action='source_id',
id=third_id,
rand=random().__str__())
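A short sketch of the search-list call built from these helpers inside a Scrapy spider method (the callback name parse_list is illustrative):

import scrapy
from science_article_cssci.configs import cssci as config
from science_article_cssci.utils import model

def request_search_page(self, query: str, page: int = 1):
    # FormRequest requires string values, so the assembled params are stringified.
    payload = {k: str(v) for k, v in model.get_search_list_param(
        query, page=page, session_key=model.new_session_key()).items()}
    return scrapy.FormRequest(url=config.CSSCI_SEARCH_API, formdata=payload,
                              callback=self.parse_list, meta={"query": query, "page": page})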

@@ -0,0 +1,91 @@
search_type = {
15: '所有字段',
1: '篇名(词)',
17: '英文篇名',
# 2:'篇名(词)(精确)',
3: '作者',
# 4:'作者(精确)',
5: '作者(第一作者)',
# 16:"作者(第一作者+精确)",
6: "关键词",
# 7:"关键词(精确)",
8: '期刊名称',
# 9:'期刊名称(精确)',
10: '作者机构',
# 11:'作者机构(第一机构)',
# 12:'作者地区',
13: '中图类号',
14: '基金细节'
}
search_type_z = {
15: '所有字段',
1: '篇名(词)',
17: '英文篇名',
2: '篇名(词)(精确)',
3: '作者',
4: '作者(精确)',
5: '作者(第一作者)',
16: "作者(第一作者+精确)",
6: "关键词",
7: "关键词(精确)",
8: '期刊名称',
9: '期刊名称(精确)',
10: '作者机构',
11: '作者机构(第一机构)',
12: '作者地区',
13: '中图类号',
18: '期刊名称(精确)',
14: '基金细节'
}
search_type_s = {
15: '所有字段',
1: '篇名(词)',
17: '英文篇名',
# 2:'篇名(词)(精确)',
3: '作者',
# 4:'作者(精确)',
# 5:'作者(第一作者)',
# 16:"作者(第一作者+精确)",
6: "关键词",
# 7:"关键词(精确)",
8: '期刊名称',
# 9:'期刊名称(精确)',
10: '作者机构',
# 11:'作者机构(第一机构)',
12: '作者地区',
13: '中图类号',
14: '基金细节'
}
search_type_ly = {
4: '被引作者(精确)',
3: '被引作者',
5: '被引作者(排除自引)',
1: '被引篇名(词)',
2: '被引篇名(词)(精确)',
6: '被引期刊名称',
7: '被引期刊名称(精确)',
# 8:'期刊名称(排除自引)',
9: '被引文献细节'
}
search_type_ly_x = {
# 4:'被引作者(精确)',
3: '被引作者',
5: '被引作者(排除自引)',
1: '被引篇名(词)',
# 2:'被引篇名(词)(精确)',
6: '被引期刊名称',
# 7:'被引期刊名称(精确)',
# 8:'期刊名称(排除自引)',
9: '被引文献细节'
}
order_value = {
'nian': "年代",
'lypm ': "篇名(词)",
# 'nian':"被引次数",
'bz': "作者"
# 'nian':"相关度"
}