cssci spider

main
zhaoxiangpeng 1 week ago
parent 3e3d237388
commit 21963b7f80

@@ -0,0 +1,32 @@ science_article_cssci/configs/cssci.py
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 16:51
# @Author : zhaoxiangpeng
# @File : cssci.py
# Data source name
SOURCE_NAME = 'cssci'
# API configuration
CSSCI_CONTROL_API = 'http://cssci.nju.edu.cn/control/controllers.php'
# Search-list endpoint
CSSCI_SEARCH_API = CSSCI_CONTROL_API
# Article-detail endpoint
CSSCI_ARTICLE_DETAIL_API = CSSCI_CONTROL_API
# Request headers configuration
POST_HEADERS_CONFIG = {
'content-type': 'application/x-www-form-urlencoded',
'host': 'cssci.nju.edu.cn',
'origin': 'http://cssci.nju.edu.cn',
'referer': 'http://cssci.nju.edu.cn/index.html',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}
# Task conditions
TASK_CONDITION_CONFIG = dict(
task_table='task_search_strategy',
task_field=["id", "content", "param"],
task_condition="source_type = 8",
db_type="mysql",
batch_limit=1
)
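A minimal sketch (assuming the requests library; the article id is illustrative, and a valid session cookie from scripts/get_cookie.py is normally needed as well) of how these constants combine with the parameter helpers added in utils/model.py later in this commit:

import requests
from science_article_cssci.configs import cssci as config
from science_article_cssci.utils import model

# Quick manual check of the detail endpoint; the real crawl goes through Scrapy.
resp = requests.get(
    config.CSSCI_ARTICLE_DETAIL_API,
    params=model.get_article_detail_param(third_id='1004518477'),  # illustrative id
    headers=config.POST_HEADERS_CONFIG,
    timeout=30,
)
print(resp.json())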

@@ -0,0 +1,50 @@ science_article_cssci/db_utils/kafka.py
# -*- coding: utf-8 -*-
# @Time : 2026/1/16 14:11
# @Author : zhaoxiangpeng
# @File : kafka.py
import time
import logging
from aiokafka import AIOKafkaProducer
from kafka.errors import KafkaError
logger = logging.getLogger(__name__)
class KafkaUtil:
MAX_RETRIES = 3
BASE_DELAY = 2  # seconds: base for exponential backoff
def __init__(self, producer):
self._kafka_producer: AIOKafkaProducer = producer
self.logger = logging.getLogger(__name__)
async def producer_send(self, data, topic):
try:
future = await self._kafka_producer.send(topic, value=data)
future.add_callback(lambda metadata: self.on_send_success(metadata, data))
future.add_errback(self.on_send_error_factory(data, topic))
except KafkaError as e:
self.logger.error(f'{data.get("id", "")} - produce failed\n'
f'reason: {e}')
def on_send_success(self, record_metadata, data):
msg_id = data.get("id", "unknown_id")
self.logger.info(
f"{msg_id} - 成功发送到: "
f"topic={record_metadata.topic} - partition={record_metadata.partition} - offset={record_metadata.offset}")
def on_send_error_factory(self, data, topic, retries=0):
msg_id = data.get("id", "unknown_id")
def on_send_error(exc):
self.logger.info(f"{msg_id} - 第 {retries + 1} 次发送失败: {exc}")
if retries < self.MAX_RETRIES:
delay = self.BASE_DELAY * (2 ** retries)
self.logger.info(f"{msg_id} - {delay}s 后重试,第 {retries + 1}/{self.MAX_RETRIES}")
time.sleep(delay)
future = self._kafka_producer.send(topic, value=data)
future.add_callback(lambda metadata: self.on_send_success(metadata, data))
future.add_errback(self.on_send_error_factory(data, topic, retries + 1))
else:
self.logger.error(f"{msg_id} - 超过最大重试次数, 请检查")
return on_send_error
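The helper above attaches kafka-python style callbacks to the awaitable returned by aiokafka and blocks with time.sleep between retries; a minimal fully-async alternative sketch, assuming aiokafka's send_and_wait and asyncio-based backoff (not the project's code):

import asyncio
import logging
from aiokafka import AIOKafkaProducer
from aiokafka.errors import KafkaError

log = logging.getLogger(__name__)

async def send_with_retry(producer: AIOKafkaProducer, topic: str, data: dict,
                          max_retries: int = 3, base_delay: float = 2.0) -> bool:
    """Send one record, retrying with exponential backoff without blocking the event loop."""
    msg_id = data.get("id", "unknown_id")
    for attempt in range(max_retries + 1):
        try:
            md = await producer.send_and_wait(topic, value=data)
            log.info("%s - sent: topic=%s partition=%s offset=%s",
                     msg_id, md.topic, md.partition, md.offset)
            return True
        except KafkaError as exc:
            log.warning("%s - attempt %d failed: %s", msg_id, attempt + 1, exc)
            if attempt < max_retries:
                await asyncio.sleep(base_delay * (2 ** attempt))
    log.error("%s - exceeded max retries", msg_id)
    return False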

@@ -0,0 +1,92 @@ science_article_cssci/db_utils/mongo.py
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Optional, Dict, Tuple
from pymongo import MongoClient
from pymongo import UpdateOne
from pymongo.errors import DuplicateKeyError, BulkWriteError
if TYPE_CHECKING:
from pymongo.database import Database
from pymongo.collection import Collection
from pymongo.results import InsertOneResult, InsertManyResult, BulkWriteResult
def build_update_query(update_data: dict, replace: bool = True) -> dict:
"""
If replace is True, existing document fields are simply overwritten ($set); otherwise list values are appended via $addToSet.
"""
update_query = {}
if not update_data:
return {}
for key, val in update_data.items():
if replace:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
else:
if isinstance(val, list):
update_query.setdefault(
"$addToSet", {}
).update({
key: {"$each": val}
})
else:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
return update_query
def update_document(filter_query: dict = None, update_data: dict = None, replace: bool = True) -> Tuple[dict, dict]:
update_query = {}
if not update_data:
return {}, {}
for key, val in update_data.items():
if replace:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
else:
if isinstance(val, list):
update_query.setdefault(
"$addToSet", {}
).update({
key: {"$each": val}
})
else:
update_query.setdefault(
"$set", {}
).update(
{key: val}
)
return filter_query, update_query
class MongoDBUtils:
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
self.client: MongoClient = None
self.db: Database = None
def insert2db(self, items, tablename, **kwargs) -> InsertOneResult:
collection: Collection = self.db.get_collection(tablename)
result: InsertOneResult = collection.insert_one(items, **kwargs)
return result
def _insert2db(self, items, tablename, ordered: bool = False, **kwargs) -> InsertManyResult:
collection: Collection = self.db.get_collection(tablename)
result: InsertManyResult = collection.insert_many(items, ordered=ordered, **kwargs)
return result
def _update2db(self, items, tablename, ordered: bool = False, **kwargs) -> BulkWriteResult:
collection: Collection = self.db.get_collection(tablename)
bulk_results: BulkWriteResult = collection.bulk_write(items, ordered=ordered, **kwargs)
return bulk_results
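A brief usage sketch (the URI, id and field values are placeholders) combining build_update_query with the bulk-write helper above:

from pymongo import MongoClient, UpdateOne
from science_article_cssci.db_utils.mongo import MongoDBUtils, build_update_query

utils = MongoDBUtils(mongo_uri="mongodb://localhost:27017", mongo_db="science2")
utils.client = MongoClient(utils.mongo_uri)
utils.db = utils.client[utils.mongo_db]

# replace=False appends list values ($addToSet) and $sets the rest.
update = build_update_query({"task_ids": [42], "exported": 0}, replace=False)
ops = [UpdateOne({"third_id": "1004518477"}, update, upsert=True)]
result = utils._update2db(ops, tablename="data_cssci_article")
print(result.upserted_count, result.modified_count)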

@@ -0,0 +1,34 @@ science_article_cssci/items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ScienceArticleCssciItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class AddItemBase(scrapy.Item):
third_id = scrapy.Field()
updated_at = scrapy.Field()
class ArticleItem(AddItemBase):
exported = scrapy.Field()
class ArticleCitedItem(AddItemBase):
cited = scrapy.Field()
class CssciArticleItem(ArticleItem):
__tablename__ = 'data_cssci_article'
third_id = scrapy.Field()
resp_raw = scrapy.Field()
detailed = scrapy.Field()
updated_at = scrapy.Field()

@@ -0,0 +1,143 @@ science_article_cssci/middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
from typing import Optional
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.http.headers import Headers
from science_article_cssci.scripts.get_cookie import GetSessionID
logger = logging.getLogger(__name__)
class ScienceArticleCssciSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
async def process_start(self, start):
# Called with an async iterator over the spider start() method or the
# matching method of an earlier spider middleware.
async for item_or_request in start:
yield item_or_request
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class ScienceArticleCssciDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CssciCookieMiddleware:
ss: Optional[GetSessionID]
def __init__(self, custom_headers: dict, cookie_cfg: dict):
self.custom_headers = custom_headers
self.headers = Headers(self.custom_headers)
self.cookies_pool_config = cookie_cfg
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
post_headers = crawler.settings.getdict('POST_HEADERS_CONFIG')
s = cls(
custom_headers=post_headers,
cookie_cfg=dict(
redis_uri=settings.get("COOKIE_POOL_CONFIG"),
pool_key=settings.get("COOKIE_POOL_REDIS_KEY"),
ttl=settings.get("COOKIE_REDIS_TTL")
)
)
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def spider_opened(self, spider):
self.ss = GetSessionID(**self.cookies_pool_config)
async def process_request(self, request, spider):
cookie_1 = await self.ss.get_cookie_from_redis()
if not cookie_1:
cookie_1 = await self.ss.get_cookie_to_redis()
logger.info("""
没有可用cookie
重新获取: %s""" % cookie_1)
request.cookies = cookie_1
request.headers = self.headers
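The cookie middleware is enabled per-spider via custom_settings further down in this commit; the equivalent project-wide setting would be:

DOWNLOADER_MIDDLEWARES = {
    "science_article_cssci.middlewares.CssciCookieMiddleware": 540,
}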

@@ -0,0 +1,275 @@ science_article_cssci/pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from __future__ import annotations
import re
import json
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Tuple, Union, Optional
import scrapy
from itemadapter import ItemAdapter
from kafka import KafkaProducer
from pymongo import MongoClient
from pymongo.errors import (
DuplicateKeyError,
BulkWriteError
)
from science_article_cssci.db_utils.mongo import MongoDBUtils, build_update_query
from science_article_cssci.db_utils.kafka import KafkaUtil
from science_article_cssci.scripts.field_assembly import FieldAssembly
if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.statscollectors import StatsCollector
from pymongo.collection import Collection
mongo_logger = logging.getLogger('pymongo')
mongo_logger.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
class ScienceArticleCssciPipeline:
def process_item(self, item, spider):
return item
class MongoPipeline(MongoDBUtils):
def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
super().__init__(mongo_uri, mongo_db)
self.stats: StatsCollector = stats
self.insert_failure_update_enable = True
self.duplicate_cover_enable = False  # overwrite duplicate documents with the new data
@classmethod
def from_crawler(cls, crawler: Crawler):
m = cls(
mongo_uri=crawler.settings.get("MONGO_URI"),
mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
stats=crawler.stats
)
return m
def open_spider(self, spider):
self.client = MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def process_item(self, item, spider) -> scrapy.Item:
"""
Insert only; duplicate-key errors are counted in stats and otherwise ignored.
"""
adapter = ItemAdapter(item)
tablename = self._get_item_table(item)
collection = self.db.get_collection(tablename)
d = adapter.asdict()
try:
collection.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(tablename))
except DuplicateKeyError as duplicate_error:
self.stats.inc_value("item2db_duplicate/{}".format(tablename))
self.stats.inc_value(f"item_dropped_reasons_count/duplicate")
except Exception:
raise
return item
def process_item_update(self, item, spider) -> scrapy.Item:
"""
Insert; on a duplicate-key error, fall back to updating the existing document.
"""
adapter = ItemAdapter(item)
tablename = self._get_item_table(item)
collection = self.db.get_collection(tablename)
d = adapter.asdict()
try:
collection.insert_one(d)
self.stats.inc_value("item2db_inserted/{}".format(tablename))
except DuplicateKeyError as duplicate_error:
if self.insert_failure_update_enable:
write_error = duplicate_error.details
filter_query, update_query = self._pick_filter_update(write_error, doc=d)
updated_at_query = None  # strip the volatile updated_at so it cannot skew the update; updates touching only non-task_id fields do not need this
key_pattern = write_error.get('keyPattern')
key_value = write_error.get('keyValue')
logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
# dedicated handling for incremental tasks
task_ids = update_query.pop("task_ids", None)
if task_ids:
# task_ids always triggers a change, so handle it first
task_id_query = {'task_ids': task_ids}
collection.update_one(filter=filter_query, update=build_update_query(task_id_query, replace=False))
updated_at_query = {"updated_at": update_query.pop('updated_at', None)}
update_q = build_update_query(update_query, replace=self.duplicate_cover_enable)
up_result = collection.update_one(filter=key_value, update=update_q, upsert=True)
if up_result.matched_count == up_result.modified_count == 1:
# refresh the update timestamp on change (it is written even when nothing changed, so we can tell when this record was last touched)
if updated_at_query:
collection.update_one(filter=key_value, update={"$set": updated_at_query})
self.stats.inc_value("item2db_updated/{}".format(tablename))
except Exception:
raise
return item
@staticmethod
def _pick_filter_update(write_error, doc: dict = None):
original_doc = write_error.get('op', doc)  # the document that was being inserted
key_pattern = write_error.get('keyPattern')
original_doc.pop("_id", None)  # drop the _id generated by the failed insert
filter_query = {}
update_query = {key: val for key, val in original_doc.items() if val}
for key in key_pattern.keys():
filter_query.update({key: update_query.pop(key, None)})
return filter_query, update_query
def close_spider(self, spider):
self.client.close()
@staticmethod
def _get_item_table(item) -> str:
"""获取Item类型"""
if hasattr(item, '__tablename__'):
return item.__class__.__tablename__
return 'items_null_table'
class Mongo2SciencePipeline(MongoPipeline):
def process_item(self, item, spider):
d = self.parse2science(item)
table = self._get_item_table(item)
coll = self.db.get_collection(table)
return d
@staticmethod
def parse2science(item) -> dict:
def change_qi(tmp_str):
return re.sub(r'^0|0$', '', tmp_str)
def change_string(tmp_str):
tmp = tmp_str.split("aaa")
tmp_z = []
for t in tmp:
if len(t) > 1:
tmp_z.append(t)
return tmp_z
adapter = ItemAdapter(item)
third_id = adapter['third_id']
resp_raw = adapter['resp_raw']
resp_raw = json.loads(resp_raw)
authors: list = resp_raw['author']
authors = [dict(zzmc=au_info['zzmc'], jgmc=au_info['jgmc'], bmmc=au_info['bmmc']) for au_info in
authors or []]  # author / institution / department
# pprint(authors)
catations: list = resp_raw.get('catation')
contents: list = resp_raw.get('contents', [])
body = [c for c in contents if c.get("sno") == adapter['third_id']]
if body:
content = body[0]
else:
content = {}
d = dict(
sno=third_id,
lypm=content['lypm'],  # title
blpm=content['blpm'],  # English title
zzjg=authors,  # authors and affiliations
wzlx=content['wzlx'],  # document type
xkfl1=content['xkfl1'],  # subject category 1
xkfl2=content.get('xkfl2', ''),  # subject category 2
xkdm1=content['xkdm1'],  # CLC class number 1
xkdm2=content.get('xkdm2', ''),  # CLC class number 2
xmlb=content.get('xmlb', ''),  # funding project
qkmc=content.get('qkmc', ''),  # source journal
# year (nian), volume (juan), issue (qi).replace(/^0|0$/g,''), pages (ym)
nian=content.get('nian', ''),  # year
juan=content.get('juan', ''),  # volume
qi=content.get('qi', ''),  # issue
ym=content.get('ym', ''),  # pages
byc=change_string(content.get('byc', '')),  # keywords
ckwx=catations  # references
)
return d
class BuildDetailPipeline:
def process_item(self, item, spider):
adapter = ItemAdapter(item)
item['detailed'] = self.build_detailed(adapter)
return item
@staticmethod
def build_detailed(item):
resp_raw = item.get("resp_raw")
dd = dict(
**FieldAssembly.parse_detail(
content=resp_raw.get('content'),
author=resp_raw.get('author'),
catation=resp_raw.get('catation'),
)
)
return dd
class KafkaPipeline:
def __init__(self, kafka_servers, topic):
self.kafka_servers = kafka_servers
self.topic = topic
self.producer: KafkaProducer = None
@classmethod
def from_crawler(cls, crawler):
return cls(
kafka_servers=crawler.settings.get('KAFKA_SERVERS', 'localhost:9092'),
topic=crawler.settings.get('KAFKA_TOPIC', 'scrapy_items')
)
def open_spider(self, spider):
self.producer: KafkaProducer = KafkaProducer(
bootstrap_servers=self.kafka_servers,
value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
acks='all',
linger_ms=50,  # batching window: wait at most 50 ms before sending
compression_type='gzip',
)
spider.logger.info(f"Connected to Kafka at {self.kafka_servers}")
def close_spider(self, spider):
if self.producer:
self.producer.flush()
self.producer.close()
spider.logger.info("Kafka connection closed")
def process_item(self, item, spider):
adapter = ItemAdapter(item)
d = adapter.asdict()
d = self.build2kafka(d)
# send to Kafka
future = self.producer.send(
topic=self.topic,
value=d,
headers=[('source_type', b'cssci')]  # kafka-python expects (str, bytes) header tuples
)
future.add_callback(self.on_send_success)
future.add_errback(self.on_send_error)
return item
def on_send_success(self, record_metadata):
"""发送成功回调"""
pass
def on_send_error(self, excp):
"""发送失败回调"""
pass
def build2kafka(self, item: dict) -> dict:
dd = dict(
id=item.get("third_id"),
**item.get('detailed')
)
return dd
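The duplicate-key handling in MongoPipeline only fires if the target collection has a unique index; a minimal sketch of creating one, assuming third_id is the dedup key (the commit itself does not define the index):

from pymongo import MongoClient, ASCENDING

client = MongoClient("mongodb://localhost:27017")  # placeholder URI
db = client["science2"]
# Assumed unique key; without such an index insert_one never raises DuplicateKeyError.
db["data_cssci_article"].create_index([("third_id", ASCENDING)], unique=True)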

@@ -0,0 +1,62 @@ config.ini
[source_type]
1 = 论文
2 = 综述
3 = 评论
5 = 报告
4 = 传记资料
9 = 其他
[source_jj]
1 = 国家自科基金
2 = 国家社科基金
3 = 国家级其它基金
4 = 教育部基金
5 = 其他部委级基金
6 = 中科院基金
7 = 社科院基金
8 = 省(市)级基金
9 = 其它基金
[yw_type]
1 = 期刊论文
11 = 电子文献
10 = 法规
9 = 标准
8 = 报告
7 = 汇编
6 = 信件
5 = 学位论文
4 = 会议文献
3 = 报纸
2 = 图书
99 = 其他
[source_xk]
630 = 管理学
850 = 民族学
860 = 新闻学与传播学
870 = 图书馆、情报与文献学
880 = 教育学
890 = 体育学
910 = 统计学
920 = 心理学
930 = 社会科学总论
940 = 军事学
950 = 文化学
960 = 人文、经济地理
970 = 环境科学
840 = 社会学
820 = 法学
710 = 马克思主义
720 = 哲学
730 = 宗教学
740 = 语言学
009 = 文学
751 = 外国文学
752 = 中国文学
760 = 艺术学
770 = 历史学
780 = 考古学
790 = 经济学
810 = 政治学
999 = 其他学科

@@ -0,0 +1,170 @@ data_export.py
# -*- coding: utf-8 -*-
# @Time : 2022/1/12 8:36
# @Author : ZhaoXiangPeng
# @File : data_export.py
import os
import re
import time
from typing import List
from configparser import ConfigParser, NoSectionError
cfg_parse = ConfigParser()
try:
cfg_parse.read(os.path.join(os.path.dirname(__file__), 'config.ini'), encoding='utf-8')
print(cfg_parse)
except NoSectionError:
pass
else:
source_type = cfg_parse['source_type']
source_jj = cfg_parse['source_jj']
yw_type = cfg_parse['yw_type']
source_xk = cfg_parse['source_xk']
def ckwx(wxs: list):
wx_list = []
if wxs is None:
return wx_list
for i in range(wxs.__len__()):
content = ''
content += str(i+1) + ' '
r1: dict = wxs[i]
if r1['ywlx'] == '1':  # journal article
tmp = r1['ywcc'].replace(r1['ywnd']+'', '').replace(r1['ywnd']+'.', '').replace(r1['ywnd']+'','').replace(r1['ywnd']+',','')
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
content += f',{tmp}'
elif r1['ywlx'] == '3':  # newspaper
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywcc"]}' if len(r1['ywcc']) else f'.{r1["ywnd"]}'
elif r1['ywlx'] == '11':  # electronic resource (website)
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
elif r1['ywlx'] == '99':  # other
content += r1['ywpm']
elif r1['ywlx'] == '7':
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
if len(r1["ywcc"]):
content += '.'+r1["ywcc"]
else:
content += f':{r1["ywqk"]}' if r1['ywqk'].__len__() else ''
content += f'.{r1["ywcbd"]}' if r1['ywcbd'].__len__() else ''
content += f':{r1["ywcbs"]}' if r1['ywcbs'].__len__() else ''
content += f',{r1["ywqk"]}' if r1['ywqk'].__len__() > 2 else ''
content += f':{r1["ywym"]}' if r1['ywym'].__len__() else ''
else:
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
content += f'.{r1["ywcbd"]}' if len(r1["ywcbd"]) else ''
content += f':{r1["ywcbs"]}' if len(r1["ywcbs"]) else ''
content += f',{r1["ywnd"]}' if len(r1["ywnd"]) > 2 else ''
content += f':{r1["ywym"]}' if len(r1["ywym"]) else ''
wx_list.append(content)
return wx_list
def process_ref_field(row: dict, to_qikan=False) -> List[dict]:
row_list = []
ckwx_list = ckwx(row.get('ckwx', []))
for ckwx_one in ckwx_list:
new_row = {}
new_row['rid'] = None
new_row['reference'] = ckwx_one
new_row['aid'] = row.get('sno')
new_row['url'] = 'http://qikan.cqvip.com/Qikan/Article/Detail?id='+row.get('sno')
row_list.append(new_row)
return row_list
def process_field(row: dict, to_qikan=False):
def jtrim(x):
return re.sub(r'^(\s|\u00A0)+|(\s|\u00A0)+$', '', x or '')
def zzjg(jgs: list):
zzjg_list = []
for i in range(jgs.__len__()):
r1: dict = jgs[i]
vals: list = [str(i+1)] + list(r1.values())
zzjg_list.append('.'.join(vals))
return zzjg_list
def vpzz(jgs: list):
zz_list = []
for i in range(jgs.__len__()):
r1: dict = jgs[i]
zz_list.append(r1['zzmc']+f'[{i+1}]')
return zz_list
def vpjg(jgs: list):
jg_list = []
for i in range(jgs.__len__()):
r1: dict = jgs[i]
jg_list.append(f'[{i+1}]'+r1['jgmc']+r1['bmmc'])
return jg_list
def wxlx(t: str):
return source_type.get(t)
def xkfl(t: str):
return source_xk.get(t)
def change_qi(tmp_str):
return re.sub(r'^0|0$', '', tmp_str)
def ndjq(n, j, q, ym):
val = ''
if len(n):
val += f'{n}'
if len(j):
val += f'{j}'
if len(q):
q = change_qi(q)
val += f'{q}'
if len(ym):
val += f'{ym}'
return val
new_row = {}
if to_qikan is False:
new_row['篇名'] = row.get('lypm')
new_row['英文篇名'] = row.get('blpm')
new_row['作者及机构'] = '; '.join(zzjg(row.get('zzjg', [])))
new_row['文献类型'] = wxlx(row.get('wzlx', ''))
fl1 = xkfl(row.get('xkfl1', ''))
fl2 = xkfl(row.get('xkfl2', ''))
new_row['学科类别'] = (fl1 if fl1 is not None else '') + ('/' + fl2 if fl2 is not None else '')
lh1 = row.get('xkdm1', '')
lh2 = row.get('xkdm2', '')
new_row['中图类号'] = (lh1 if len(lh1) else '') + ('/' + lh2 if len(lh2) else '')
new_row['基金项目'] = row.get('xmlb')
new_row['来源期刊'] = row.get('qkmc')
new_row['年代卷期'] = ndjq(row.get('nian'), row.get('juan'), row.get('qi'), row.get('ym'))
new_row['关键词'] = '/'.join(row.get('byc', []))
new_row['参考文献'] = '; '.join(ckwx(row.get('ckwx', [])))
else:
new_row['序号'] = 1
new_row['题名'] = row.get('lypm')
jgzz = row.get('zzjg', [])
new_row['作者'] = ';'.join(vpzz(jgzz))
new_row['机构'] = ';'.join(vpjg(jgzz))
new_row['基金'] = row.get('xmlb')
new_row['刊名'] = row.get('qkmc')
new_row['年'] = row.get('nian')
new_row['卷'] = row.get('juan')
new_row['期'] = change_qi(row.get('qi'))
new_row['ISSN号'] = None
new_row['CN号'] = None
new_row['页码'] = str(row.get('ym'))
new_row['关键词'] = ';'.join(row.get('byc', []))
lh1 = row.get('xkdm1', '')
lh2 = row.get('xkdm2', '')
new_row['分类号'] = (lh1 if len(lh1) else '') + (';' + lh2 if len(lh2) else '')
new_row['文摘'] = None
new_row['网址'] = 'http://qikan.cqvip.com/Qikan/Article/Detail?id='+row.get('sno')
new_row['aid'] = row.get('sno')
new_row['FileName'] = None
new_row['引文'] = '; '.join(ckwx(row.get('ckwx', [])))
new_row['被引量'] = None
return new_row
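A short usage sketch with a synthetic row (all values are illustrative) showing the two export layouts produced by process_field:

row = {
    'sno': '1004518477',
    'lypm': '示例篇名', 'blpm': 'Example Title',
    'zzjg': [{'zzmc': '张三', 'jgmc': '南京大学', 'bmmc': '信息管理学院'}],
    'wzlx': '1', 'xkfl1': '870', 'xkdm1': 'G25', 'xmlb': '',
    'qkmc': '中国图书馆学报', 'nian': '2023', 'juan': '49', 'qi': '01', 'ym': '4-15',
    'byc': ['示例', '关键词'], 'ckwx': [],
}
print(process_field(row))                  # CSSCI-style columns (篇名, 作者及机构, ...)
print(process_field(row, to_qikan=True))   # CQVIP-style columns (题名, 刊名, ...)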

@@ -0,0 +1,140 @@ science_article_cssci/scripts/field_assembly.py
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 17:22
# @Author : zhaoxiangpeng
# @File : field_assembly.py
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import List
def strip_symbol(text, symbol='aaa') -> str:
return text.strip(symbol)
def join_list(texts: list, symbol='/'):
return symbol.join(texts)
def spilt_text(text, symbol='aaa') -> List[str]:
r = []
if not text:
return r
text = strip_symbol(text)
texts = text.split(symbol)
for t in texts:
t = strip_symbol(t)
if t:
r.append(t)
return r
def author_org(datas: list[dict]) -> list:
r = []
for data in datas:
sno = data.get('sno')
au_name = data.get('zzmc')
org_name = data.get('jgmc')
o = [org_name]
xueyuan = data.get('bmmc')
if xueyuan:
o.append(xueyuan)
r.append(f'[{au_name}]'+join_list(o, '.'))
return r
def org(datas: list[dict]) -> list:
r = []
for data in datas:
org_name = data.get('jgmc')
r.append(org_name)
return r
class ArticleAbs:
def to_dict(self):
pass
class FieldAssembly:
@staticmethod
def parse_author():
return
@staticmethod
def parse_reference(refs: List[dict]):
ref_list = []
for i, r1 in enumerate(refs):
content = ''
content += str(i + 1) + '.'
if r1['ywlx'] == '1':  # journal article
tmp = r1['ywcc'].replace(r1['ywnd'] + '', '').replace(r1['ywnd'] + '.', '').replace(r1['ywnd'] + '',
'').replace(
r1['ywnd'] + ',', '')
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
content += f',{tmp}'
elif r1['ywlx'] == '3':  # newspaper
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywcc"]}' if len(r1['ywcc']) else f'.{r1["ywnd"]}'
elif r1['ywlx'] == '11':  # electronic resource (website)
content += f'{r1["ywzz"]}.{r1["ywpm"]}.{r1["ywqk"]}'
content += f'.{r1["ywnd"]}' if len(r1['ywnd']) > 2 else ''
elif r1['ywlx'] == '99':  # other
content += r1['ywpm']
elif r1['ywlx'] == '7':
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
if len(r1["ywcc"]):
content += '.' + r1["ywcc"]
else:
content += f':{r1["ywqk"]}' if r1['ywqk'].__len__() else ''
content += f'.{r1["ywcbd"]}' if r1['ywcbd'].__len__() else ''
content += f':{r1["ywcbs"]}' if r1['ywcbs'].__len__() else ''
content += f',{r1["ywqk"]}' if r1['ywqk'].__len__() > 2 else ''
content += f':{r1["ywym"]}' if r1['ywym'].__len__() else ''
else:
content += f'{r1["ywzz"]}' if len(r1["ywzz"]) else ''
content += f'.{r1["ywpm"]}' if len(r1["ywpm"]) else ''
content += f'.{r1["ywcbd"]}' if len(r1["ywcbd"]) else ''
content += f':{r1["ywcbs"]}' if len(r1["ywcbs"]) else ''
content += f',{r1["ywnd"]}' if len(r1["ywnd"]) > 2 else ''
content += f':{r1["ywym"]}' if len(r1["ywym"]) else ''
ref_list.append(content)
return ref_list
@staticmethod
def parse_detail(content: dict, author: List[dict], catation: List[dict]) -> dict:
d = dict()
d['a_id'] = content.get('id')
d['title'] = content.get('lypm')
d['title_en'] = content.get('blpm')
author_list = spilt_text(content.get('authors', ''), symbol='aaa')
d['author'] = join_list(author_list, symbol='/')
d['fund'] = content.get('xmlb')
d['source'] = content.get('qkmc')
org_list = org(author)
d['first_org'] = org_list and org_list[0]
d['org'] = join_list(author_org(author), symbol='/')
d['first_author'] = author_list and author_list[0]
d['classification_code'] = content.get('xkdm1')
year = content.get('nian')
d['year'] = year
d['volume'] = content.get('juan')
d['issue'] = content.get('qi')
d['page'] = content.get('ym')
keyword_list = spilt_text(content.get('byc'), symbol=';')
d['keywords'] = join_list(keyword_list, symbol='/')
d['fund'] = content.get('xmlb')
d['fund_type'] = content.get('jjlb')
d['references'] = FieldAssembly.parse_reference(catation)
return d
def parsing(self):
pass

@@ -0,0 +1,106 @@ science_article_cssci/scripts/get_cookie.py
# -*- coding: utf-8 -*-
# @Time : 2024/11/12 16:10
# @Author : zhaoxiangpeng
# @File : get_cookie.py
# Used to obtain the session_id cookie
import asyncio
import json
from random import random
import redis
import redis.asyncio  # make sure the asyncio submodule is loaded for redis.asyncio.Redis below
import aiohttp
class GetSessionID:
__redis_cli = None
def __init__(self, redis_uri: str, pool_key: str, ttl: int = None, **kwargs):
self.redis_uri = redis_uri
self.pool_key = pool_key
self.ttl = ttl
@staticmethod
async def new_session_id() -> dict:
session = aiohttp.ClientSession()
resp = await session.get(
'http://cssci.nju.edu.cn/index.html',
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'host': 'cssci.nju.edu.cn'
}
)
assert resp.status == 200
resp = await session.post(
'http://cssci.nju.edu.cn/control/controllers.php',
headers={
'content-type': 'application/x-www-form-urlencoded',
'host': 'cssci.nju.edu.cn',
'origin': 'http://cssci.nju.edu.cn',
'referer': 'http://cssci.nju.edu.cn/index.html',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
},
data=dict(control='user_control', action='check_user_online', rand=random()),
)
assert resp.status == 200
# convert the cookie jar to a dict and return it
cookie_obj = dict()
cookie_jar = session.cookie_jar.filter_cookies(resp.url)
for key, jar in cookie_jar.items():
cookie_obj.setdefault(jar.key, jar.value)
# close the session
await session.close()
return cookie_obj
@property
def redis_cli(self):
if self.__class__.__redis_cli is None:
self.__class__.__redis_cli = redis.asyncio.Redis.from_url(
self.redis_uri,
decode_responses=True
)
return self.__class__.__redis_cli
async def set_cookie_to_redis(self, val):
result = await self.redis_cli.setex(self.pool_key, time=self.ttl, value=val)
return result
async def get_cookie_from_redis(self, to_dict=True):
"""
:param to_dict: whether to convert the stored string back into a dict
:return:
"""
cookie_str = await self.redis_cli.get(self.pool_key)
if not cookie_str:
return cookie_str
if to_dict:
return json.loads(cookie_str)
return cookie_str
async def get_cookie_to_redis(self) -> dict:
"""
Fetch a fresh cookie and push it straight into Redis.
:return:
"""
cookie_obj = await self.new_session_id()
await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
return cookie_obj
async def test(self):
from loguru import logger
cookie_obj = await self.new_session_id()
logger.info("""
cookie: %s""" % cookie_obj)
res = await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
logger.info("""
插入: %s""" % res)
res = await self.get_cookie_from_redis()
logger.info("""
获取: %s""" % res)
def main(self):
asyncio.run(self.test())
if __name__ == '__main__':
# redis_uri / pool_key / ttl mirror REDIS_URL, COOKIE_POOL_REDIS_KEY and COOKIE_REDIS_TTL in settings.py
GetSessionID(redis_uri='redis://:kcidea1509@192.168.1.211:6379/10',
pool_key='cookies_pool:cssci:session', ttl=60 * 60 * 6).main()

@@ -0,0 +1,110 @@ science_article_cssci/settings.py
# Scrapy settings for science_article_cssci project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "science_article_cssci"
SPIDER_MODULES = ["science_article_cssci.spiders"]
NEWSPIDER_MODULE = "science_article_cssci.spiders"
ADDONS = {}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
POST_HEADERS_CONFIG = {
# 'content-type': 'application/x-www-form-urlencoded',
'host': 'cssci.nju.edu.cn',
'origin': 'http://cssci.nju.edu.cn',
'referer': 'http://cssci.nju.edu.cn/index.html',
'user-agent': USER_AGENT
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "science_article_cssci.middlewares.ScienceArticleCssciSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "science_article_cssci.middlewares.ScienceArticleCssciDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "science_article_cssci.pipelines.ScienceArticleCssciPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'
MONGO_URI_SCIENCE = "mongodb://root:kcidea1509!%25)(@43.140.203.187:27017/"
MONGO_DATABASE_SCIENCE = 'science'
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
# Cookie pool Redis configuration
COOKIE_POOL_CONFIG = REDIS_URL
COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session'
COOKIE_REDIS_TTL = 60 * 60 * 6
KAFKA_SERVERS = ['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092']
KAFKA_TOPIC = "test2kafka"

@@ -0,0 +1,4 @@ science_article_cssci/spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,63 @@
import re
import json
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str
from science_article_cssci.utils import model
from science_article_cssci.items import CssciArticleItem
from science_article_cssci.configs import cssci as config
class CssciArticleByIdSpider(RedisSpider):
name = "cssci_article_by_id"
custom_settings = dict(
DOWNLOADER_MIDDLEWARES={
"science_article_cssci.middlewares.CssciCookieMiddleware": 540,
},
ITEM_PIPELINES={
"science_article_cssci.pipelines.BuildDetailPipeline": 300,
"science_article_cssci.pipelines.MongoPipeline": 310,
"science_article_cssci.pipelines.KafkaPipeline": 350,
},
# LOG_LEVEL="INFO"
)
def make_request_from_data(self, data):
data = bytes_to_str(data)
data = json.loads(data)
third_id = data.get("third_id")
yield scrapy.FormRequest(
url=config.CSSCI_ARTICLE_DETAIL_API, method="GET",
formdata=model.get_article_detail_param(third_id=third_id), callback=self.parse_detail,
meta={"third_id": third_id})
def parse_detail(self, response, **kwargs):
def change_qi(tmp_str):
return re.sub(r'^0|0$', '', tmp_str)
def change_string(tmp_str):
tmp = tmp_str.split("aaa")
tmp_z = []
for t in tmp:
if len(t) > 1:
tmp_z.append(t)
return tmp_z
# print(response)
meta = response.meta
third_id = meta['third_id']
resp_json = response.json()
contents: list = resp_json.get('contents', [])
body = [c for c in contents if c.get("sno") == third_id]
if body:
content = body[0]
else:
content = {}
d = dict(
content=content,
author=resp_json.get("author"),
catation=resp_json.get("catation"),
)
self.logger.debug(d)
article_item = CssciArticleItem(**dict(third_id=third_id, resp_raw=d))
yield article_item
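Because this is a scrapy_redis RedisSpider, crawls are seeded by pushing JSON tasks onto the spider's Redis start key; a minimal sketch (the key assumes scrapy_redis's default '<spider name>:start_urls' and the third_id is illustrative):

import json
import redis

r = redis.Redis.from_url('redis://:kcidea1509@192.168.1.211:6379/10')  # REDIS_URL from settings.py
r.lpush('cssci_article_by_id:start_urls', json.dumps({"third_id": "1004518477"}))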

@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/13 11:19
# @Author : zhaoxiangpeng
# @File : __init__.py

@@ -0,0 +1,53 @@ science_article_cssci/utils/model.py
# -*- coding: utf-8 -*-
# @Time : 2024/11/13 14:47
# @Author : zhaoxiangpeng
# @File : model.py
import math
from random import random
from datetime import datetime
from urllib.parse import quote
CURRENT_YEAR = datetime.today().year
def new_session_key() -> int:
"""
Originally assumed to be issued by the server; it is actually regenerated every time
a search reloads the HTML, and nothing appears to validate it:
var session_key=Math.floor(Math.random()*1000)
:return:
"""
return math.floor(random() * 1000)
def get_search_list_param(
query: str,
page: int = 1,
year: int = CURRENT_YEAR,
session_key: int = None,
**kwargs
):
"""
:param query:
:param page:
:param year:
:param session_key: generated in the HTML returned for each search; it does not change while paging
:param kwargs:
:return:
"""
param = {'control': 'search_base', 'action': 'search_lysy',
'title': quote(query, safe='/', encoding='utf-8'),
'xkfl1': '',
'wzlx': '', 'qkname': '', 'type': '', 'jj': '', 'start_year': 1998, 'end_year': CURRENT_YEAR, 'nian': year,
'juan': '',
'qi': '', 'xw1': '', 'xw2': '', 'pagesize': 20, 'pagenow': page, 'order_type': 'nian', 'order_px': 'ASC',
'search_tag': 1, 'session_key': session_key, 'rand': random().__str__()}
return param
def get_article_detail_param(third_id=None):
return dict(control='search',
action='source_id',
id=third_id,
rand=random().__str__())
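A short sketch of the search-list call built from these helpers inside a Scrapy spider method (the callback name parse_list is illustrative):

import scrapy
from science_article_cssci.configs import cssci as config
from science_article_cssci.utils import model

def request_search_page(self, query: str, page: int = 1):
    # FormRequest requires string values, so the assembled params are stringified.
    payload = {k: str(v) for k, v in model.get_search_list_param(
        query, page=page, session_key=model.new_session_key()).items()}
    return scrapy.FormRequest(url=config.CSSCI_SEARCH_API, formdata=payload,
                              callback=self.parse_list, meta={"query": query, "page": page})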

@@ -0,0 +1,91 @@
search_type = {
15: '所有字段',
1: '篇名(词)',
17: '英文篇名',
# 2:'篇名(词)(精确)',
3: '作者',
# 4:'作者(精确)',
5: '作者(第一作者)',
# 16:"作者(第一作者+精确)",
6: "关键词",
# 7:"关键词(精确)",
8: '期刊名称',
# 9:'期刊名称(精确)',
10: '作者机构',
# 11:'作者机构(第一机构)',
# 12:'作者地区',
13: '中图类号',
14: '基金细节'
}
search_type_z = {
15: '所有字段',
1: '篇名(词)',
17: '英文篇名',
2: '篇名(词)(精确)',
3: '作者',
4: '作者(精确)',
5: '作者(第一作者)',
16: "作者(第一作者+精确)",
6: "关键词",
7: "关键词(精确)",
8: '期刊名称',
9: '期刊名称(精确)',
10: '作者机构',
11: '作者机构(第一机构)',
12: '作者地区',
13: '中图类号',
18: '期刊名称(精确)',
14: '基金细节'
}
search_type_s = {
15: '所有字段',
1: '篇名(词)',
17: '英文篇名',
# 2:'篇名(词)(精确)',
3: '作者',
# 4:'作者(精确)',
# 5:'作者(第一作者)',
# 16:"作者(第一作者+精确)",
6: "关键词",
# 7:"关键词(精确)",
8: '期刊名称',
# 9:'期刊名称(精确)',
10: '作者机构',
# 11:'作者机构(第一机构)',
12: '作者地区',
13: '中图类号',
14: '基金细节'
}
search_type_ly = {
4: '被引作者(精确)',
3: '被引作者',
5: '被引作者(排除自引)',
1: '被引篇名(词)',
2: '被引篇名(词)(精确)',
6: '被引期刊名称',
7: '被引期刊名称(精确)',
# 8:'期刊名称(排除自引)',
9: '被引文献细节'
}
search_type_ly_x = {
# 4:'被引作者(精确)',
3: '被引作者',
5: '被引作者(排除自引)',
1: '被引篇名(词)',
# 2:'被引篇名(词)(精确)',
6: '被引期刊名称',
# 7:'被引期刊名称(精确)',
# 8:'期刊名称(排除自引)',
9: '被引文献细节'
}
order_value = {
'nian': "年代",
'lypm ': "篇名(词)",
# 'nian':"被引次数",
'bz': "作者"
# 'nian':"相关度"
}