diff --git a/science_article_wos/a.py b/science_article_cssci/science_article_cssci/__init__.py
similarity index 100%
rename from science_article_wos/a.py
rename to science_article_cssci/science_article_cssci/__init__.py
diff --git a/science_article_cssci/science_article_cssci/configs/cssci.py b/science_article_cssci/science_article_cssci/configs/cssci.py
new file mode 100644
index 0000000..90525b5
--- /dev/null
+++ b/science_article_cssci/science_article_cssci/configs/cssci.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+# @Time : 2026/1/19 16:51
+# @Author : zhaoxiangpeng
+# @File : cssci.py
+# data source name
+SOURCE_NAME = 'cssci'
+
+
+# API configuration
+CSSCI_CONTROL_API = 'http://cssci.nju.edu.cn/control/controllers.php'
+# search result list endpoint
+CSSCI_SEARCH_API = CSSCI_CONTROL_API
+# article detail endpoint
+CSSCI_ARTICLE_DETAIL_API = CSSCI_CONTROL_API
+
+# request headers configuration
+POST_HEADERS_CONFIG = {
+    'content-type': 'application/x-www-form-urlencoded',
+    'host': 'cssci.nju.edu.cn',
+    'origin': 'http://cssci.nju.edu.cn',
+    'referer': 'http://cssci.nju.edu.cn/index.html',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
+}
+
+# task condition
+TASK_CONDITION_CONFIG = dict(
+    task_table='task_search_strategy',
+    task_field=["id", "content", "param"],
+    task_condition="source_type = 8",
+    db_type="mysql",
+    batch_limit=1
+)
diff --git a/science_article_cssci/science_article_cssci/db_utils/__init__.py b/science_article_cssci/science_article_cssci/db_utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/science_article_cssci/science_article_cssci/db_utils/kafka.py b/science_article_cssci/science_article_cssci/db_utils/kafka.py
new file mode 100644
index 0000000..b895899
--- /dev/null
+++ b/science_article_cssci/science_article_cssci/db_utils/kafka.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# @Time : 2026/1/16 14:11
+# @Author : zhaoxiangpeng
+# @File : kafka.py
+import asyncio
+import logging
+from aiokafka import AIOKafkaProducer
+from aiokafka.errors import KafkaError
+logger = logging.getLogger(__name__)
+
+
+class KafkaUtil:
+    MAX_RETRIES = 3
+    BASE_DELAY = 2  # seconds: base of the exponential backoff
+
+    def __init__(self, producer):
+        self._kafka_producer: AIOKafkaProducer = producer
+        self.logger = logging.getLogger(__name__)
+
+    async def producer_send(self, data, topic):
+        """Send one message and retry with exponential backoff on failure."""
+        msg_id = data.get("id", "unknown_id")
+        for retries in range(self.MAX_RETRIES + 1):
+            try:
+                # send_and_wait() resolves once the broker acknowledges the record
+                record_metadata = await self._kafka_producer.send_and_wait(topic, value=data)
+                self.on_send_success(record_metadata, data)
+                return record_metadata
+            except KafkaError as exc:
+                self.logger.info(f"{msg_id} - send attempt {retries + 1} failed: {exc}")
+                if retries < self.MAX_RETRIES:
+                    delay = self.BASE_DELAY * (2 ** retries)
+                    self.logger.info(f"{msg_id} - retrying in {delay}s, attempt {retries + 1}/{self.MAX_RETRIES}")
+                    await asyncio.sleep(delay)
+                else:
+                    self.logger.error(f"{msg_id} - exceeded the maximum number of retries, please check")
+        return None
+
+    def on_send_success(self, record_metadata, data):
+        msg_id = data.get("id", "unknown_id")
+        self.logger.info(
+            f"{msg_id} - successfully sent to: "
+            f"topic={record_metadata.topic} - partition={record_metadata.partition} - offset={record_metadata.offset}")
diff --git 
a/science_article_cssci/science_article_cssci/db_utils/mongo.py b/science_article_cssci/science_article_cssci/db_utils/mongo.py new file mode 100644 index 0000000..1fc5f7b --- /dev/null +++ b/science_article_cssci/science_article_cssci/db_utils/mongo.py @@ -0,0 +1,92 @@ +from __future__ import annotations +import logging +from typing import TYPE_CHECKING, Optional, Dict, Tuple +from pymongo import MongoClient +from pymongo import UpdateOne +from pymongo.errors import DuplicateKeyError, BulkWriteError + +if TYPE_CHECKING: + from pymongo.database import Database + from pymongo.collection import Collection + from pymongo.results import InsertOneResult, InsertManyResult, BulkWriteResult + + +def build_update_query(update_data: dict, replace: bool = True) -> dict: + """ + 如果replace为True,则直接覆盖原有的document + """ + update_query = {} + if not update_data: + return {} + for key, val in update_data.items(): + if replace: + update_query.setdefault( + "$set", {} + ).update( + {key: val} + ) + else: + if isinstance(val, list): + update_query.setdefault( + "$addToSet", {} + ).update({ + key: {"$each": val} + }) + else: + update_query.setdefault( + "$set", {} + ).update( + {key: val} + ) + return update_query + + +def update_document(filter_query: dict = None, update_data: dict = None, replace: bool = True) -> Tuple[dict, dict]: + update_query = {} + if not update_data: + return {}, {} + + for key, val in update_data.items(): + if replace: + update_query.setdefault( + "$set", {} + ).update( + {key: val} + ) + else: + if isinstance(val, list): + update_query.setdefault( + "$addToSet", {} + ).update({ + key: {"$each": val} + }) + else: + update_query.setdefault( + "$set", {} + ).update( + {key: val} + ) + return filter_query, update_query + + +class MongoDBUtils: + def __init__(self, mongo_uri, mongo_db): + self.mongo_uri = mongo_uri + self.mongo_db = mongo_db + self.client: MongoClient = None + self.db: Database = None + + def insert2db(self, items, tablename, **kwargs) -> InsertOneResult: + collection: Collection = self.db.get_collection(tablename) + result: InsertOneResult = collection.insert_one(items, **kwargs) + return result + + def _insert2db(self, items, tablename, ordered: bool = False, **kwargs) -> InsertManyResult: + collection: Collection = self.db.get_collection(tablename) + result: InsertManyResult = collection.insert_many(items, ordered=ordered, **kwargs) + return result + + def _update2db(self, items, tablename, ordered: bool = False, **kwargs) -> BulkWriteResult: + collection: Collection = self.db.get_collection(tablename) + bulk_results: BulkWriteResult = collection.bulk_write(items, ordered=ordered, **kwargs) + return bulk_results diff --git a/science_article_cssci/science_article_cssci/items.py b/science_article_cssci/science_article_cssci/items.py new file mode 100644 index 0000000..df47228 --- /dev/null +++ b/science_article_cssci/science_article_cssci/items.py @@ -0,0 +1,34 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class ScienceArticleCssciItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass + + +class AddItemBase(scrapy.Item): + third_id = scrapy.Field() + updated_at = scrapy.Field() + + +class ArticleItem(AddItemBase): + exported = scrapy.Field() + + +class ArticleCitedItem(AddItemBase): + cited = scrapy.Field() + + +class CssciArticleItem(ArticleItem): + __tablename__ = 'data_cssci_article' + + third_id = scrapy.Field() + 
resp_raw = scrapy.Field() + detailed = scrapy.Field() + updated_at = scrapy.Field() diff --git a/science_article_cssci/science_article_cssci/middlewares.py b/science_article_cssci/science_article_cssci/middlewares.py new file mode 100644 index 0000000..a080ec0 --- /dev/null +++ b/science_article_cssci/science_article_cssci/middlewares.py @@ -0,0 +1,143 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html +import logging +from typing import Optional +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter +from scrapy.http.headers import Headers + +from science_article_cssci.scripts.get_cookie import GetSessionID +logger = logging.getLogger(__name__) + + +class ScienceArticleCssciSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + async def process_start(self, start): + # Called with an async iterator over the spider start() method or the + # maching method of an earlier spider middleware. + async for item_or_request in start: + yield item_or_request + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class ScienceArticleCssciDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class CssciCookieMiddleware: + ss: Optional[GetSessionID] + + def __init__(self, custom_headers: dict, cookie_cfg: dict): + self.custom_headers = custom_headers + self.headers = Headers(self.custom_headers) + self.cookies_pool_config = cookie_cfg + + @classmethod + def from_crawler(cls, crawler): + settings = crawler.settings + post_headers = crawler.settings.getdict('POST_HEADERS_CONFIG') + s = cls( + custom_headers=post_headers, + cookie_cfg=dict( + redis_uri=settings.get("COOKIE_POOL_CONFIG"), + pool_key=settings.get("COOKIE_POOL_REDIS_KEY"), + ttl=settings.get("COOKIE_REDIS_TTL") + ) + ) + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def spider_opened(self, spider): + self.ss = GetSessionID(**self.cookies_pool_config) + + async def process_request(self, request, spider): + cookie_1 = await self.ss.get_cookie_from_redis() + if not cookie_1: + cookie_1 = await self.ss.get_cookie_to_redis() + logger.info(""" + 没有可用cookie + 重新获取: %s""" % cookie_1) + + request.cookies = cookie_1 + request.headers = self.headers diff --git a/science_article_cssci/science_article_cssci/pipelines.py b/science_article_cssci/science_article_cssci/pipelines.py new file mode 100644 index 0000000..8ebc9bb --- /dev/null +++ b/science_article_cssci/science_article_cssci/pipelines.py @@ -0,0 +1,275 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html +# useful for handling different item types with a single interface +from __future__ import annotations +import re +import json +import logging +from datetime import datetime +from typing import TYPE_CHECKING, Tuple, Union, Optional +import scrapy +from itemadapter import ItemAdapter +from kafka import KafkaProducer +from pymongo import MongoClient +from pymongo.errors import ( + DuplicateKeyError, + BulkWriteError +) +from science_article_cssci.db_utils.mongo import MongoDBUtils, build_update_query +from science_article_cssci.db_utils.kafka import KafkaUtil +from science_article_cssci.scripts.field_assembly import FieldAssembly + +if TYPE_CHECKING: + from scrapy.crawler import Crawler + from scrapy.statscollectors import StatsCollector + from pymongo.collection import Collection + +mongo_logger = logging.getLogger('pymongo') +mongo_logger.setLevel(logging.WARNING) +logger = logging.getLogger(__name__) + + +class ScienceArticleCssciPipeline: + def process_item(self, item, spider): + return item + + +class MongoPipeline(MongoDBUtils): + def __init__(self, mongo_uri, mongo_db, stats: StatsCollector): + super().__init__(mongo_uri, mongo_db) + self.stats: StatsCollector = stats + self.insert_failure_update_enable = True + self.duplicate_cover_enable = False # 重复项覆盖 + + @classmethod + def from_crawler(cls, crawler: Crawler): + m = cls( + mongo_uri=crawler.settings.get("MONGO_URI"), + mongo_db=crawler.settings.get("MONGO_DATABASE", "items"), + stats=crawler.stats + ) + return m + + def open_spider(self, spider): + self.client = MongoClient(self.mongo_uri) + self.db = self.client[self.mongo_db] + + def process_item(self, item, spider) -> scrapy.Item: + """ + 插入遇到错误不处理 + """ + adapter = ItemAdapter(item) + tablename = 
self._get_item_table(item) + collection = self.db.get_collection(tablename) + d = adapter.asdict() + try: + collection.insert_one(d) + self.stats.inc_value("item2db_inserted/{}".format(tablename)) + except DuplicateKeyError as duplicate_error: + self.stats.inc_value("item2db_duplicate/{}".format(tablename)) + self.stats.inc_value(f"item_dropped_reasons_count/duplicate") + except Exception: + raise + return item + + def process_item_update(self, item, spider) -> scrapy.Item: + """ + 插入遇到错误进行更新 + """ + adapter = ItemAdapter(item) + tablename = self._get_item_table(item) + collection = self.db.get_collection(tablename) + d = adapter.asdict() + try: + collection.insert_one(d) + self.stats.inc_value("item2db_inserted/{}".format(tablename)) + except DuplicateKeyError as duplicate_error: + if self.insert_failure_update_enable: + write_error = duplicate_error.details + filter_query, update_query = self._pick_filter_update(write_error, doc=d) + updated_at_query = None # 删除不确定因素的时间防止影响更新(更新除了task_id外的字段不需要处理这个) + + key_pattern = write_error.get('keyPattern') + key_value = write_error.get('keyValue') + logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value) + + # 专门用来适配增量的任务 + task_ids = update_query.pop("task_ids", None) + if task_ids: + # task_id一定会引起变动,所以先处理 + task_id_query = {'task_ids': task_ids} + collection.update_one(filter=filter_query, update=build_update_query(task_id_query, replace=False)) + updated_at_query = {"updated_at": update_query.pop('updated_at', None)} + + update_q = build_update_query(update_query, replace=self.duplicate_cover_enable) + up_result = collection.update_one(filter=key_value, update=update_q, upsert=True) + if up_result.matched_count == up_result.modified_count == 1: + # 变动了就要修改更新的时间(其实没变也要更新,这样可以知道什么时候动过这条数据) + if updated_at_query: + collection.update_one(filter=key_value, update={"$set": updated_at_query}) + self.stats.inc_value("item2db_updated/{}".format(tablename)) + except Exception: + raise + + return item + + @staticmethod + def _pick_filter_update(write_error, doc: dict = None): + original_doc = write_error.get('op', doc) # 插入的数据 + key_pattern = write_error.get('keyPattern') + original_doc.pop("_id", None) # 删掉插入失败产生的_id + filter_query = {} + update_query = {key: val for key, val in original_doc.items() if val} + + for key in key_pattern.keys(): + filter_query.update({key: update_query.pop(key, None)}) + return filter_query, update_query + + def close_spider(self, spider): + self.client.close() + + @staticmethod + def _get_item_table(item) -> str: + """获取Item类型""" + if hasattr(item, '__tablename__'): + return item.__class__.__tablename__ + return 'items_null_table' + + +class Mongo2SciencePipeline(MongoPipeline): + def process_item(self, item, spider): + d = self.parse2science(item) + table = self._get_item_table(item) + coll = self.db.get_collection(table) + return d + + @staticmethod + def parse2science(item) -> dict: + def change_qi(tmp_str): + return re.sub(r'^0|0$', '', tmp_str) + + def change_string(tmp_str): + tmp = tmp_str.split("aaa") + tmp_z = [] + for t in tmp: + if len(t) > 1: + tmp_z.append(t) + return tmp_z + + adapter = ItemAdapter(item) + third_id = adapter['third_id'] + resp_raw = adapter['resp_raw'] + resp_raw = json.loads(resp_raw) + authors: list = resp_raw['author'] + authors = [dict(zzmc=au_info['zzmc'], jgmc=au_info['jgmc'], bmmc=au_info['bmmc']) for au_info in + authors or []] # 作者.学校.院系 + # pprint(authors) + catations: list = resp_raw.get('catation') + contents: list = resp_raw.get('contents', []) + body = [c for c in 
contents if c.get("sno") == adapter['third_id']]
+        if body:
+            content = body[0]
+        else:
+            content = {}
+        d = dict(
+            sno=third_id,
+            lypm=content.get('lypm', ''),  # title
+            blpm=content.get('blpm', ''),  # English title
+            zzjg=authors,  # authors and affiliations
+            wzlx=content.get('wzlx', ''),  # document type
+            xkfl1=content.get('xkfl1', ''),  # subject category 1
+            xkfl2=content.get('xkfl2', ''),  # subject category 2
+            xkdm1=content.get('xkdm1', ''),  # CLC number 1
+            xkdm2=content.get('xkdm2', ''),  # CLC number 2
+            xmlb=content.get('xmlb', ''),  # fund/project
+            qkmc=content.get('qkmc', ''),  # source journal
+            # (nian)年(juan)卷第(qi).replace(/^0|0$/g,'')期:(ym)
+            nian=content.get('nian', ''),  # year
+            juan=content.get('juan', ''),  # volume
+            qi=content.get('qi', ''),  # issue
+            ym=content.get('ym', ''),  # pages
+            byc=change_string(content.get('byc', '')),  # keywords
+            ckwx=catations  # references
+        )
+        return d
+
+
+class BuildDetailPipeline:
+    def process_item(self, item, spider):
+        adapter = ItemAdapter(item)
+        item['detailed'] = self.build_detailed(adapter)
+
+        return item
+
+    @staticmethod
+    def build_detailed(item):
+        resp_raw = item.get("resp_raw")
+        dd = dict(
+            **FieldAssembly.parse_detail(
+                content=resp_raw.get('content'),
+                author=resp_raw.get('author'),
+                catation=resp_raw.get('catation'),
+            )
+        )
+        return dd
+
+
+class KafkaPipeline:
+    def __init__(self, kafka_servers, topic):
+        self.kafka_servers = kafka_servers
+        self.topic = topic
+        self.producer: KafkaProducer = None
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            kafka_servers=crawler.settings.get('KAFKA_SERVERS', 'localhost:9092'),
+            topic=crawler.settings.get('KAFKA_TOPIC', 'scrapy_items')
+        )
+
+    def open_spider(self, spider):
+        self.producer: KafkaProducer = KafkaProducer(
+            bootstrap_servers=self.kafka_servers,
+            value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
+            acks='all',
+            linger_ms=50,  # wait up to 50 ms to batch records before sending
+            compression_type='gzip',
+        )
+        spider.logger.info(f"Connected to Kafka at {self.kafka_servers}")
+
+    def close_spider(self, spider):
+        if self.producer:
+            self.producer.flush()
+            self.producer.close()
+            spider.logger.info("Kafka connection closed")
+
+    def process_item(self, item, spider):
+        adapter = ItemAdapter(item)
+        d = adapter.asdict()
+        d = self.build2kafka(d)
+        # send to Kafka; headers must be a list of (key, bytes) tuples
+        future = self.producer.send(
+            topic=self.topic,
+            value=d,
+            headers=[('source_type', b'cssci')]
+        )
+        future.add_callback(self.on_send_success)
+        future.add_errback(self.on_send_error)
+        return item
+
+    def on_send_success(self, record_metadata):
+        """Callback for a successful send."""
+        pass
+
+    def on_send_error(self, excp):
+        """Errback for a failed send."""
+        logger.error("Kafka send failed: %s", excp)
+
+    def build2kafka(self, item: dict) -> dict:
+        dd = dict(
+            id=item.get("third_id"),
+            **item.get('detailed')
+        )
+        return dd
diff --git a/science_article_cssci/science_article_cssci/scripts/config.ini b/science_article_cssci/science_article_cssci/scripts/config.ini
new file mode 100644
index 0000000..d298038
--- /dev/null
+++ b/science_article_cssci/science_article_cssci/scripts/config.ini
@@ -0,0 +1,62 @@
+[source_type]
+1 = 论文
+2 = 综述
+3 = 评论
+5 = 报告
+4 = 传记资料
+9 = 其他
+
+[source_jj]
+1 = 国家自科基金
+2 = 国家社科基金
+3 = 国家级其它基金
+4 = 教育部基金
+5 = 其他部委级基金
+6 = 中科院基金
+7 = 社科院基金
+8 = 省(市)级基金
+9 = 其它基金
+
+[yw_type]
+1 = 期刊论文
+11 = 电子文献
+10 = 法规
+9 = 标准
+8 = 报告
+7 = 汇编
+6 = 信件
+5 = 学位论文
+4 = 会议文献
+3 = 报纸
+2 = 图书
+99 = 其他
+
+[source_xk]
+630 = 管理学
+850 = 民族学
+860 = 新闻学与传播学
+870 = 图书馆、情报与文献学
+880 = 教育学
+890 = 体育学
+910 = 统计学
+920 = 心理学
+930 = 社会科学总论
+940 = 军事学
+950 = 文化学
+960 = 人文、经济地理
+970 = 环境科学
+840 = 社会学
+820 = 法学
+710 = 马克思主义
+720 = 哲学
+730 = 宗教学
+740 = 语言学
+009 = 文学
+751 = 外国文学
+752 = 中国文学
+760 = 艺术学
+770 = 历史学
+780 = 考古学
+790 = 经济学 
+810 = 政治学 +999 = 其他学科 \ No newline at end of file diff --git a/science_article_cssci/science_article_cssci/scripts/data_export.py b/science_article_cssci/science_article_cssci/scripts/data_export.py new file mode 100644 index 0000000..5e906a6 --- /dev/null +++ b/science_article_cssci/science_article_cssci/scripts/data_export.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/1/12 8:36 +# @Author : ZhaoXiangPeng +# @File : data_export.py + +import os +import re +import time +from typing import List +from configparser import ConfigParser, NoSectionError + +cfg_parse = ConfigParser() +try: + cfg_parse.read(os.path.join(os.path.dirname(__file__), 'config.ini'), encoding='utf-8') + print(cfg_parse) +except NoSectionError: + pass +else: + source_type = cfg_parse['source_type'] + source_jj = cfg_parse['source_jj'] + yw_type = cfg_parse['yw_type'] + source_xk = cfg_parse['source_xk'] + + +def ckwx(wxs: list): + wx_list = [] + if wxs is None: + return wx_list + for i in range(wxs.__len__()): + content = '' + content += str(i+1) + ' ' + r1: dict = wxs[i] + if r1['ywlx'] == '1': # 期刊论文 + tmp = r1['ywcc'].replace(r1['ywnd']+',', '').replace(r1['ywnd']+'.', '').replace(r1['ywnd']+'(','(').replace(r1['ywnd']+',','') + content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}' + content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else '' + content += f',{tmp}' + elif r1['ywlx'] == '3': # 报纸 + content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}' + content += f'.{r1["ywcc"]}' if len(r1['ywcc']) else f'.{r1["ywnd"]}' + elif r1['ywlx'] == '11': # 电子文献,网站 + content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}' + content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else '' + elif r1['ywlx'] == '99': # 其它 + content += r1['ywpm'] + elif r1['ywlx'] == '7': + content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else '' + content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else '' + if len(r1["ywcc"]): + content += '.'+r1["ywcc"] + else: + content += f':{r1["ywqk"]}' if r1['ywqk'].__len__() else '' + content += f'.{r1["ywcbd"]}' if r1['ywcbd'].__len__() else '' + content += f':{r1["ywcbs"]}' if r1['ywcbs'].__len__() else '' + content += f',{r1["ywqk"]}' if r1['ywqk'].__len__() > 2 else '' + content += f':{r1["ywym"]}' if r1['ywym'].__len__() else '' + else: + content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else '' + content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else '' + content += f'.{r1["ywcbd"]}' if len(r1["ywcbd"]) else '' + content += f':{r1["ywcbs"]}' if len(r1["ywcbs"]) else '' + content += f',{r1["ywnd"]}' if len(r1["ywnd"]) > 2 else '' + content += f':{r1["ywym"]}' if len(r1["ywym"]) else '' + wx_list.append(content) + return wx_list + + +def process_ref_field(row: dict, to_qikan=False) -> List[dict]: + row_list = [] + ckwx_list = ckwx(row.get('ckwx', [])) + for ckwx_one in ckwx_list: + new_row = {} + new_row['rid'] = None + new_row['reference'] = ckwx_one + new_row['aid'] = row.get('sno') + new_row['url'] = 'http://qikan.cqvip.com/Qikan/Article/Detail?id='+row.get('sno') + row_list.append(new_row) + return row_list + + +def process_field(row: dict, to_qikan=False): + def jtrim(x): + return re.sub(r'^(\s|\u00A0)+|(\s|\u00A0)+$', '', x or '') + + def zzjg(jgs: list): + zzjg_list = [] + for i in range(jgs.__len__()): + r1: dict = jgs[i] + vals: list = [str(i+1)] + list(r1.values()) + zzjg_list.append('.'.join(vals)) + return zzjg_list + + def vpzz(jgs: list): + zz_list = [] + for i in range(jgs.__len__()): + r1: dict = jgs[i] + zz_list.append(r1['zzmc']+f'[{i+1}]') + return zz_list + + def vpjg(jgs: list): + jg_list = 
[] + for i in range(jgs.__len__()): + r1: dict = jgs[i] + jg_list.append(f'[{i+1}]'+r1['jgmc']+r1['bmmc']) + return jg_list + + def wxlx(t: str): + return source_type.get(t) + + def xkfl(t: str): + return source_xk.get(t) + + def change_qi(tmp_str): + return re.sub(r'^0|0$', '', tmp_str) + + def ndjq(n, j, q, ym): + val = '' + if len(n): + val += f'{n}年' + if len(j): + val += f'{j}卷' + if len(q): + q = change_qi(q) + val += f'第{q}期' + if len(ym): + val += f':{ym}' + return val + + new_row = {} + if to_qikan is False: + new_row['篇名'] = row.get('lypm') + new_row['英文篇名'] = row.get('blpm') + new_row['作者及机构'] = '; '.join(zzjg(row.get('zzjg', []))) + new_row['文献类型'] = wxlx(row.get('wzlx', '')) + fl1 = xkfl(row.get('xkfl1', '')) + fl2 = xkfl(row.get('xkfl2', '')) + new_row['学科类别'] = (fl1 if fl1 is not None else '') + ('/' + fl2 if fl2 is not None else '') + lh1 = row.get('xkdm1', '') + lh2 = row.get('xkdm2', '') + new_row['中图类号'] = (lh1 if len(lh1) else '') + ('/' + lh2 if len(lh2) else '') + new_row['基金项目'] = row.get('xmlb') + new_row['来源期刊'] = row.get('qkmc') + new_row['年代卷期'] = ndjq(row.get('nian'), row.get('juan'), row.get('qi'), row.get('ym')) + new_row['关键词'] = '/'.join(row.get('byc', [])) + new_row['参考文献'] = '; '.join(ckwx(row.get('ckwx', []))) + else: + new_row['序号'] = 1 + new_row['题名'] = row.get('lypm') + jgzz = row.get('zzjg', []) + new_row['作者'] = ';'.join(vpzz(jgzz)) + new_row['机构'] = ';'.join(vpjg(jgzz)) + new_row['基金'] = row.get('xmlb') + new_row['刊名'] = row.get('qkmc') + new_row['年'] = row.get('nian') + new_row['卷'] = row.get('juan') + new_row['期'] = change_qi(row.get('qi')) + new_row['ISSN号'] = None + new_row['CN号'] = None + new_row['页码'] = str(row.get('ym')) + new_row['关键词'] = ';'.join(row.get('byc', [])) + lh1 = row.get('xkdm1', '') + lh2 = row.get('xkdm2', '') + new_row['分类号'] = (lh1 if len(lh1) else '') + (';' + lh2 if len(lh2) else '') + new_row['文摘'] = None + new_row['网址'] = 'http://qikan.cqvip.com/Qikan/Article/Detail?id='+row.get('sno') + new_row['aid'] = row.get('sno') + new_row['FileName'] = None + new_row['引文'] = '; '.join(ckwx(row.get('ckwx', []))) + new_row['被引量'] = None + return new_row diff --git a/science_article_cssci/science_article_cssci/scripts/field_assembly.py b/science_article_cssci/science_article_cssci/scripts/field_assembly.py new file mode 100644 index 0000000..a21841f --- /dev/null +++ b/science_article_cssci/science_article_cssci/scripts/field_assembly.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +# @Time : 2026/1/19 17:22 +# @Author : zhaoxiangpeng +# @File : field_assembly.py +from __future__ import annotations +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from typing import List + + +def strip_symbol(text, symbol='aaa') -> str: + return text.strip(symbol) + + +def join_list(texts: list, symbol='/'): + return symbol.join(texts) + + +def spilt_text(text, symbol='aaa') -> List[str]: + r = [] + if not text: + return r + text = strip_symbol(text) + texts = text.split(symbol) + for t in texts: + t = strip_symbol(t) + if t: + r.append(t) + return r + + +def author_org(datas: list[dict]) -> list: + r = [] + for data in datas: + sno = data.get('sno') + au_name = data.get('zzmc') + org_name = data.get('jgmc') + o = [org_name] + xueyuan = data.get('bmmc') + if xueyuan: + o.append(xueyuan) + r.append(f'[{au_name}]'+join_list(o, '.')) + return r + + +def org(datas: list[dict]) -> list: + r = [] + for data in datas: + org_name = data.get('jgmc') + r.append(org_name) + return r + + +class ArticleAbs: + def to_dict(self): + pass + + +class FieldAssembly: 
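+    # Helpers that flatten the raw CSSCI detail response (content / author / catation)
+    # into the detail dict consumed by BuildDetailPipeline; parse_reference() renders
+    # the citation list into human-readable reference strings.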
+ + @staticmethod + def parse_author(): + return + + @staticmethod + def parse_reference(refs: List[dict]): + ref_list = [] + for i, r1 in enumerate(refs): + content = '' + content += str(i + 1) + '.' + if r1['ywlx'] == '1': # 期刊论文 + tmp = r1['ywcc'].replace(r1['ywnd'] + ',', '').replace(r1['ywnd'] + '.', '').replace(r1['ywnd'] + '(', + '(').replace( + r1['ywnd'] + ',', '') + content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}' + content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else '' + content += f',{tmp}' + elif r1['ywlx'] == '3': # 报纸 + content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}' + content += f'.{r1["ywcc"]}' if len(r1['ywcc']) else f'.{r1["ywnd"]}' + elif r1['ywlx'] == '11': # 电子文献,网站 + content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}' + content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else '' + elif r1['ywlx'] == '99': # 其它 + content += r1['ywpm'] + elif r1['ywlx'] == '7': + content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else '' + content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else '' + if len(r1["ywcc"]): + content += '.' + r1["ywcc"] + else: + content += f':{r1["ywqk"]}' if r1['ywqk'].__len__() else '' + content += f'.{r1["ywcbd"]}' if r1['ywcbd'].__len__() else '' + content += f':{r1["ywcbs"]}' if r1['ywcbs'].__len__() else '' + content += f',{r1["ywqk"]}' if r1['ywqk'].__len__() > 2 else '' + content += f':{r1["ywym"]}' if r1['ywym'].__len__() else '' + else: + content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else '' + content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else '' + content += f'.{r1["ywcbd"]}' if len(r1["ywcbd"]) else '' + content += f':{r1["ywcbs"]}' if len(r1["ywcbs"]) else '' + content += f',{r1["ywnd"]}' if len(r1["ywnd"]) > 2 else '' + content += f':{r1["ywym"]}' if len(r1["ywym"]) else '' + + ref_list.append(content) + + return ref_list + + @staticmethod + def parse_detail(content: dict, author: List[dict], catation: List[dict]) -> dict: + d = dict() + d['a_id'] = content.get('id') + d['title'] = content.get('lypm') + d['title_en'] = content.get('blpm') + author_list = spilt_text(content.get('authors', ''), symbol='aaa') + d['author'] = join_list(author_list, symbol='/') + d['fund'] = content.get('xmlb') + d['source'] = content.get('qkmc') + org_list = org(author) + d['first_org'] = org_list and org_list[0] + + d['org'] = join_list(author_org(author), symbol='/') + d['first_author'] = author_list and author_list[0] + d['classification_code'] = content.get('xkdm1') + year = content.get('nian') + d['year'] = year + d['volume'] = content.get('juan') + d['issue'] = content.get('qi') + d['page'] = content.get('ym') + keyword_list = spilt_text(content.get('byc'), symbol=';') + d['keywords'] = join_list(keyword_list, symbol='/') + d['fund'] = content.get('xmlb') + d['fund_type'] = content.get('jjlb') + d['references'] = FieldAssembly.parse_reference(catation) + + return d + + def parsing(self): + pass + diff --git a/science_article_cssci/science_article_cssci/scripts/get_cookie.py b/science_article_cssci/science_article_cssci/scripts/get_cookie.py new file mode 100644 index 0000000..673999b --- /dev/null +++ b/science_article_cssci/science_article_cssci/scripts/get_cookie.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/11/12 16:10 +# @Author : zhaoxiangpeng +# @File : get_cookie.py +# 用来获取session_id + +import asyncio +import json +from random import random + +import redis +import aiohttp + + +class GetSessionID: + __redis_cli = None + + def __init__(self, redis_uri: str, pool_key: str, ttl: int = None, **kwargs): + self.redis_uri = redis_uri + 
self.pool_key = pool_key + self.ttl = ttl + + @staticmethod + async def new_session_id() -> dict: + session = aiohttp.ClientSession() + resp = await session.get( + 'http://cssci.nju.edu.cn/index.html', + headers={ + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', + 'host': 'cssci.nju.edu.cn' + } + ) + assert resp.status == 200 + resp = await session.post( + 'http://cssci.nju.edu.cn/control/controllers.php', + headers={ + 'content-type': 'application/x-www-form-urlencoded', + 'host': 'cssci.nju.edu.cn', + 'origin': 'http://cssci.nju.edu.cn', + 'referer': 'http://cssci.nju.edu.cn/index.html', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36' + }, + data=dict(control='user_control', action='check_user_online', rand=random()), + ) + assert resp.status == 200 + # 把cookiejar转为dict并返回 + cookie_obj = dict() + cookie_jar = session.cookie_jar.filter_cookies(resp.url) + for key, jar in cookie_jar.items(): + cookie_obj.setdefault(jar.key, jar.value) + # 关闭session + await session.close() + return cookie_obj + + @property + def redis_cli(self): + if self.__class__.__redis_cli is None: + self.__class__.__redis_cli = redis.asyncio.Redis.from_url( + self.redis_uri, + decode_responses=True + ) + return self.__class__.__redis_cli + + async def set_cookie_to_redis(self, val): + result = await self.redis_cli.setex(self.pool_key, time=self.ttl, value=val) + return result + + async def get_cookie_from_redis(self, to_dict=True): + """ + :param to_dict: 是否从字符串转为dict + :return: + """ + cookie_str = await self.redis_cli.get(self.pool_key) + if not cookie_str: + return cookie_str + if to_dict: + return json.loads(cookie_str) + return cookie_str + + async def get_cookie_to_redis(self) -> dict: + """ + 直接获取cookie并塞到redis + :return: + """ + cookie_obj = await self.new_session_id() + await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False)) + return cookie_obj + + async def test(self): + from loguru import logger + cookie_obj = await self.new_session_id() + logger.info(""" + cookie: %s""" % cookie_obj) + res = await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False)) + logger.info(""" + 插入: %s""" % res) + res = await self.get_cookie_from_redis() + logger.info(""" + 获取: %s""" % res) + + def main(self): + asyncio.run(self.test()) + + +if __name__ == '__main__': + GetSessionID().main() diff --git a/science_article_cssci/science_article_cssci/settings.py b/science_article_cssci/science_article_cssci/settings.py new file mode 100644 index 0000000..5bc6a08 --- /dev/null +++ b/science_article_cssci/science_article_cssci/settings.py @@ -0,0 +1,110 @@ +# Scrapy settings for science_article_cssci project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "science_article_cssci" + +SPIDER_MODULES = ["science_article_cssci.spiders"] +NEWSPIDER_MODULE = "science_article_cssci.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Concurrency and throttling settings +#CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 1 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} +POST_HEADERS_CONFIG = { + # 'content-type': 'application/x-www-form-urlencoded', + 'host': 'cssci.nju.edu.cn', + 'origin': 'http://cssci.nju.edu.cn', + 'referer': 'http://cssci.nju.edu.cn/index.html', + 'user-agent': USER_AGENT +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "science_article_cssci.middlewares.ScienceArticleCssciSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# "science_article_cssci.middlewares.ScienceArticleCssciDownloaderMiddleware": 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# "science_article_cssci.pipelines.ScienceArticleCssciPipeline": 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" + +MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true" +MONGO_DATABASE = 'science2' + +MONGO_URI_SCIENCE = "mongodb://root:kcidea1509!%25)(@43.140.203.187:27017/" +MONGO_DATABASE_SCIENCE = 'science' + +REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10' + +# 
cookie redis 配置 +COOKIE_POOL_CONFIG = REDIS_URL +COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session' +COOKIE_REDIS_TTL = 60 * 60 * 6 + +KAFKA_SERVERS = ['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092'] +KAFKA_TOPIC = "test2kafka" # diff --git a/science_article_cssci/science_article_cssci/spiders/__init__.py b/science_article_cssci/science_article_cssci/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/science_article_cssci/science_article_cssci/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/science_article_cssci/science_article_cssci/spiders/cssci_article_by_id.py b/science_article_cssci/science_article_cssci/spiders/cssci_article_by_id.py new file mode 100644 index 0000000..197720f --- /dev/null +++ b/science_article_cssci/science_article_cssci/spiders/cssci_article_by_id.py @@ -0,0 +1,63 @@ +import re +import json +import scrapy +from scrapy_redis.spiders import RedisSpider +from scrapy_redis.utils import bytes_to_str +from science_article_cssci.utils import model +from science_article_cssci.items import CssciArticleItem +from science_article_cssci.configs import cssci as config + + +class CssciArticleByIdSpider(RedisSpider): + name = "cssci_article_by_id" + custom_settings = dict( + DOWNLOADER_MIDDLEWARES={ + "science_article_cssci.middlewares.CssciCookieMiddleware": 540, + }, + ITEM_PIPELINES={ + "science_article_cssci.pipelines.BuildDetailPipeline": 300, + "science_article_cssci.pipelines.MongoPipeline": 310, + "science_article_cssci.pipelines.KafkaPipeline": 350, + }, + # LOG_LEVEL="INFO" + ) + + def make_request_from_data(self, data): + data = bytes_to_str(data) + data = json.loads(data) + third_id = data.get("third_id") + yield scrapy.FormRequest( + url=config.CSSCI_ARTICLE_DETAIL_API, method="GET", + formdata=model.get_article_detail_param(third_id=third_id), callback=self.parse_detail, + meta={"third_id": third_id}) + + def parse_detail(self, response, **kwargs): + def change_qi(tmp_str): + return re.sub(r'^0|0$', '', tmp_str) + + def change_string(tmp_str): + tmp = tmp_str.split("aaa") + tmp_z = [] + for t in tmp: + if len(t) > 1: + tmp_z.append(t) + return tmp_z + + # print(response) + meta = response.meta + third_id = meta['third_id'] + resp_json = response.json() + contents: list = resp_json.get('contents', []) + body = [c for c in contents if c.get("sno") == third_id] + if body: + content = body[0] + else: + content = {} + d = dict( + content=content, + author=resp_json.get("author"), + catation=resp_json.get("catation"), + ) + self.logger.debug(d) + article_item = CssciArticleItem(**dict(third_id=third_id, resp_raw=d)) + yield article_item diff --git a/science_article_cssci/science_article_cssci/utils/__init__.py b/science_article_cssci/science_article_cssci/utils/__init__.py new file mode 100644 index 0000000..56f0e22 --- /dev/null +++ b/science_article_cssci/science_article_cssci/utils/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/11/13 11:19 +# @Author : zhaoxiangpeng +# @File : __init__.py diff --git a/science_article_cssci/science_article_cssci/utils/model.py b/science_article_cssci/science_article_cssci/utils/model.py new file mode 100644 index 0000000..9cf1a1d --- /dev/null +++ b/science_article_cssci/science_article_cssci/utils/model.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/11/13 14:47 +# @Author : zhaoxiangpeng +# @File : 
model.py + +import math +from random import random +from datetime import datetime +from urllib.parse import quote + +CURRENT_YEAR = datetime.today().year + + +def new_session_key() -> int: + """ + 起初以为是服务器返回的 + 每次搜索刷新,因为加载了html,浏览器也么有判断 + var session_key=Math.floor(Math.random()*1000) + :return: + """ + return math.floor(random() * 1000) + + +def get_search_list_param( + query: str, + page: int = 1, + year: int = CURRENT_YEAR, + session_key: int = None, + **kwargs +): + """ + :param query: + :param page: + :param year: + :param session_key: 每次搜索会出一个html,在html中生成,每次翻页不会变化 + :param kwargs: + :return: + """ + param = {'control': 'search_base', 'action': 'search_lysy', + 'title': quote(query, safe='/', encoding='utf-8'), + 'xkfl1': '', + 'wzlx': '', 'qkname': '', 'type': '', 'jj': '', 'start_year': 1998, 'end_year': CURRENT_YEAR, 'nian': year, + 'juan': '', + 'qi': '', 'xw1': '', 'xw2': '', 'pagesize': 20, 'pagenow': page, 'order_type': 'nian', 'order_px': 'ASC', + 'search_tag': 1, 'session_key': session_key, 'rand': random().__str__()} + return param + + +def get_article_detail_param(third_id=None): + return dict(control='search', + action='source_id', + id=third_id, + rand=random().__str__()) diff --git a/science_article_cssci/science_article_cssci/utils/search_type_config.py b/science_article_cssci/science_article_cssci/utils/search_type_config.py new file mode 100644 index 0000000..5563437 --- /dev/null +++ b/science_article_cssci/science_article_cssci/utils/search_type_config.py @@ -0,0 +1,91 @@ +search_type = { + 15: '所有字段', + 1: '篇名(词)', + 17: '英文篇名', + # 2:'篇名(词)(精确)', + 3: '作者', + # 4:'作者(精确)', + 5: '作者(第一作者)', + # 16:"作者(第一作者+精确)", + 6: "关键词", + # 7:"关键词(精确)", + 8: '期刊名称', + # 9:'期刊名称(精确)', + 10: '作者机构', + # 11:'作者机构(第一机构)', + # 12:'作者地区', + 13: '中图类号', + 14: '基金细节' +} + +search_type_z = { + 15: '所有字段', + 1: '篇名(词)', + 17: '英文篇名', + 2: '篇名(词)(精确)', + 3: '作者', + 4: '作者(精确)', + 5: '作者(第一作者)', + 16: "作者(第一作者+精确)", + 6: "关键词", + 7: "关键词(精确)", + 8: '期刊名称', + 9: '期刊名称(精确)', + 10: '作者机构', + 11: '作者机构(第一机构)', + 12: '作者地区', + 13: '中图类号', + 18: '期刊名称(精确)', + 14: '基金细节' +} + +search_type_s = { + 15: '所有字段', + 1: '篇名(词)', + 17: '英文篇名', + # 2:'篇名(词)(精确)', + 3: '作者', + # 4:'作者(精确)', + # 5:'作者(第一作者)', + # 16:"作者(第一作者+精确)", + 6: "关键词", + # 7:"关键词(精确)", + 8: '期刊名称', + # 9:'期刊名称(精确)', + 10: '作者机构', + # 11:'作者机构(第一机构)', + 12: '作者地区', + 13: '中图类号', + 14: '基金细节' +} +search_type_ly = { + 4: '被引作者(精确)', + 3: '被引作者', + 5: '被引作者(排除自引)', + 1: '被引篇名(词)', + 2: '被引篇名(词)(精确)', + 6: '被引期刊名称', + 7: '被引期刊名称(精确)', + # 8:'期刊名称(排除自引)', + 9: '被引文献细节' +} + +search_type_ly_x = { + # 4:'被引作者(精确)', + 3: '被引作者', + 5: '被引作者(排除自引)', + 1: '被引篇名(词)', + # 2:'被引篇名(词)(精确)', + 6: '被引期刊名称', + # 7:'被引期刊名称(精确)', + # 8:'期刊名称(排除自引)', + 9: '被引文献细节' +} + +order_value = { + 'nian': "年代", + 'lypm ': "篇名(词)", + # 'nian':"被引次数", + 'bz': "作者" + # 'nian':"相关度" +}
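
Usage sketch: cssci_article_by_id is a scrapy-redis RedisSpider, so it consumes JSON seeds of the form {"third_id": "..."} from its Redis start queue (see make_request_from_data above). A minimal seeding sketch, assuming the scrapy-redis default key '<spider_name>:start_urls' and the REDIS_URL from settings.py; the id value below is a made-up placeholder, not a real CSSCI sno:

import json
import redis

# REDIS_URL taken from settings.py; adjust if the deployment overrides it
r = redis.Redis.from_url("redis://:kcidea1509@192.168.1.211:6379/10")
seed = {"third_id": "1000000000"}  # placeholder article sno read by make_request_from_data
# scrapy-redis default start-URLs key is '<spider_name>:start_urls' unless redis_key is set
r.lpush("cssci_article_by_id:start_urls", json.dumps(seed))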