Compare commits
16 Commits
f977b8ad51 ... 6c0d732877
| SHA1 | Date |
|---|---|
| 6c0d732877 | 3 weeks ago |
| 9a29a8ace7 | 1 month ago |
| 5d6a562cca | 1 month ago |
| 669c7836b6 | 1 month ago |
| 6d4b0a7dd9 | 1 month ago |
| a86cbc9952 | 1 month ago |
| fea5d948ae | 1 month ago |
| 5d44e5fe86 | 1 month ago |
| fe7044292b | 2 months ago |
| 576260f52d | 2 months ago |
| 7cda5cc406 | 2 months ago |
| e3a23ad33e | 2 months ago |
| 78a76ba9f2 | 2 months ago |
| 1c2fd3c988 | 2 months ago |
| 129ab6569d | 2 months ago |
| ea68319ee6 | 3 months ago |
@@ -0,0 +1,57 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Optional, Tuple

from pymongo import MongoClient
from pymongo import UpdateOne
from pymongo.errors import DuplicateKeyError, BulkWriteError

if TYPE_CHECKING:
    from pymongo.database import Database
    from pymongo.collection import Collection
    from pymongo.results import InsertManyResult, BulkWriteResult


def update_document(filter_query: Optional[dict] = None, update_data: Optional[dict] = None, replace: bool = True) -> Tuple[dict, dict]:
    """Build a (filter, update) pair for MongoDB from a plain dict.

    With replace=True every field goes into $set; otherwise list values are
    merged via $addToSet/$each and scalar values still use $set.
    """
    update_query = {}
    if not update_data:
        return {}, {}

    for key, val in update_data.items():
        if replace:
            update_query.setdefault("$set", {}).update({key: val})
        else:
            if isinstance(val, list):
                update_query.setdefault("$addToSet", {}).update({key: {"$each": val}})
            else:
                update_query.setdefault("$set", {}).update({key: val})
    return filter_query, update_query


class MongoDBUtils:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client: Optional[MongoClient] = None
        self.db: Optional[Database] = None

    def _insert2db(self, items, tablename, ordered: bool = False, **kwargs) -> InsertManyResult:
        collection: Collection = self.db.get_collection(tablename)
        result: InsertManyResult = collection.insert_many(items, ordered=ordered, **kwargs)
        return result

    def _update2db(self, items, tablename, ordered: bool = False, **kwargs) -> BulkWriteResult:
        collection: Collection = self.db.get_collection(tablename)
        bulk_results: BulkWriteResult = collection.bulk_write(items, ordered=ordered, **kwargs)
        return bulk_results
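
A minimal usage sketch of how update_document could pair with pymongo's UpdateOne to feed _update2db (my assumption, not part of this diff; the MongoDB URI and document values are made up):

# Hypothetical illustration; collection name comes from the items in this PR.
utils = MongoDBUtils("mongodb://localhost:27017", "science")
utils.client = MongoClient(utils.mongo_uri)
utils.db = utils.client[utils.mongo_db]

flt, upd = update_document(
    filter_query={"third_id": "WOS:000123456789"},   # hypothetical id
    update_data={"cited": 12, "task_ids": [1, 2]},
    replace=False,  # lists merge via $addToSet, scalars via $set
)
ops = [UpdateOne(flt, upd, upsert=True)]
result = utils._update2db(ops, tablename="relation_cited_number_wos")
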
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# @Time : 2025/10/27 17:12
# @Author : zhaoxiangpeng
# @File : ackextension.py
import logging

import pymysql
from scrapy import signals
from scrapy.crawler import Crawler

logger = logging.getLogger(__name__)


class SpiderProtocol:
    name: str
    record_id: int
    org_id: int
    org_name: str
    query_id: int
    query_content: str

    def get_records_found(self) -> int: ...


class ACKExtension:
    def __init__(self, crawler: Crawler):
        self.crawler = crawler
        self.change_state_sql = 'update task_batch_record set %(update_kws)s where %(update_cond)s'

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler=crawler)
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_error, signal=signals.spider_error)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_opened(self, spider):
        kws = {
            'is_done': 2,
        }
        sql = self.change_state_sql % {
            'update_kws': ', '.join([f'{k}={v}' for k, v in kws.items()]),
            'update_cond': 'id=%(record_id)s' % {'record_id': spider.record_id}
        }
        self._execute_sql(sql)

    def spider_closed(self, spider: SpiderProtocol):
        """
        # Update the task status
        # Notify
        """
        kws = {
            'is_done': 1,
            'result_count': spider.get_records_found(),
            'updated_time': 'CURRENT_TIMESTAMP'
        }
        sql = self.change_state_sql % {
            'update_kws': ', '.join([f'{k}={v}' for k, v in kws.items()]),
            'update_cond': 'id=%(record_id)s' % {'record_id': spider.record_id}
        }
        self._execute_sql(sql)

    def spider_error(self, spider: SpiderProtocol):
        kws = {
            'is_done': -1,
            'updated_time': 'CURRENT_TIMESTAMP'
        }
        sql = self.change_state_sql % {
            'update_kws': ', '.join([f'{k}={v}' for k, v in kws.items()]),
            'update_cond': 'id=%(record_id)s' % {'record_id': spider.record_id}
        }
        self._execute_sql(sql)

    def _execute_sql(self, sql):
        settings = self.crawler.settings
        client = pymysql.connect(
            host=settings.get('MYSQL_HOST'),
            port=settings.get('MYSQL_PORT', 3306),
            database=settings.get('MYSQL_DATABASE'),
            user=settings.get('MYSQL_USER'),
            passwd=settings.get('MYSQL_PASSWORD'),
        )
        try:
            cursor = client.cursor()
            cursor.execute(sql)
            cursor.connection.commit()
            logger.info(f'Execute SQL: {sql}')
        except Exception as e:
            logger.exception(e)
        finally:
            client.close()
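
For context, a sketch of how a Scrapy extension like this is typically enabled in project settings (assumed, not shown in this diff; the module path, priority, and MySQL values are placeholders):

# settings.py -- hypothetical snippet
EXTENSIONS = {
    "science_article_add.extensions.ackextension.ACKExtension": 500,  # module path is a guess
}
MYSQL_HOST = "127.0.0.1"
MYSQL_PORT = 3306
MYSQL_DATABASE = "science"
MYSQL_USER = "root"
MYSQL_PASSWORD = "change-me"
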
@@ -0,0 +1,36 @@
import scrapy

from science_article_add.items import ArticleItem, IdRelationItem, ArticleCitedItem


class WosItem(scrapy.Item):
    # define the fields for your item here like:
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class WosArticleItem(ArticleItem):
    """WOS publication item."""
    __tablename__ = 'data_wos_article'

    third_id = scrapy.Field()
    exported = scrapy.Field()
    updated_at = scrapy.Field()


class WosCitedNumberItem(ArticleCitedItem):
    """Citation-count item for WOS publications."""
    __tablename__ = 'relation_cited_number_wos'

    third_id = scrapy.Field()
    cited = scrapy.Field()
    updated_at = scrapy.Field()


class WosIdRelationItem(IdRelationItem):
    __tablename__ = 'relation_school_wos'

    query_ids = scrapy.Field()
    school_ids = scrapy.Field()
    task_ids = scrapy.Field()
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/2 13:34
# @Author : zhaoxiangpeng
# @File : verify_data.py
import logging

from itemadapter import ItemAdapter
from pymongo import MongoClient

from science_article_add.items import ArticleItem


class VerifyDataIntegrity:
    def __init__(self, mongo_uri, mongo_db):
        self.successful_delete = False
        self.batch_ids = set()
        self.successful = []
        self.logger = logging.getLogger(__name__)

        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client: MongoClient = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        c = cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
        )
        return c

    def init_db(self):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def open_spider(self, spider):
        spider_batch_ids = spider.get_batch_ids()
        for batch in spider_batch_ids:
            if batch.get("field", "UT") == "UT":
                self.batch_ids.add(batch.get("third_id"))
        self.init_db()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if isinstance(item, ArticleItem):
            unique_id = adapter.get("third_id")
            self.successful.append(unique_id)
            if self.successful_delete:
                self.batch_ids.discard(unique_id)
        return item

    def close_spider(self, spider):
        failure = self.batch_ids - set(self.successful)
        coll = self.db.get_collection("todo_ids_wos")
        if self.successful:
            if self.successful_delete:
                coll.delete_many(filter={"third_id": {"$in": self.successful}})
                self.logger.info("Successfully deleted %d articles", len(self.successful))
            else:
                coll.update_many(filter={"third_id": {"$in": self.successful}}, update={"$set": {"state": 1}})
                self.logger.info("Successfully updated %d articles", len(self.successful))
        if failure:
            self.logger.warning("Not downloaded: %s", list(failure))
            coll.update_many(filter={"third_id": {"$in": list(failure)}}, update={"$set": {"state": -1}})
        else:
            self.logger.info("Successfully verified: download is complete with no anomalies")
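
The pipeline above assumes spider.get_batch_ids() yields dicts carrying the WOS UT ids that should come back as articles. A minimal sketch of that contract (my reading of open_spider, not part of the diff; the ids are hypothetical):

# Hypothetical spider method matching what VerifyDataIntegrity.open_spider expects.
def get_batch_ids(self):
    # Each entry names the search field and the id to verify against downloaded items.
    return [
        {"field": "UT", "third_id": "WOS:000111222333"},
        {"field": "UT", "third_id": "WOS:000444555666"},
    ]
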
@@ -0,0 +1,40 @@
# pipelines.py
import pymongo
from itemadapter import ItemAdapter
from science_article_add.items.wos import WosCitedNumberItem, WosIdRelationItem


class MongoDBPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'scrapy_data')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)

        # Route the item to a collection based on its type
        if isinstance(item, WosIdRelationItem):
            collection_name = 'relation_school_wos'
        elif isinstance(item, WosCitedNumberItem):
            collection_name = 'relation_cited_number_wos'
        else:
            collection_name = 'data_other'

        # Insert the document
        self.db[collection_name].insert_one(dict(adapter))

        return item
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
# @Time : 2024/3/5 16:05
# @Author : zhaoxiangpeng
# @File : parse_data.py

import logging
from typing import Union
from science_article_add.utils.tools import str2int

logger = logging.getLogger(__name__)


DEFAULT_TABLE_HEAD = ['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'OI', 'FU', 'FP', 'FX', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA', 'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA', 'UT']
DEFAULT_TABLE_HEAD_LOWER = [h.lower() for h in DEFAULT_TABLE_HEAD]


def to_dict(data, headers: list):
    """Map one tab-separated record line onto the given header names."""
    data_text = data.strip().decode()
    _to_dict = {}

    for key, value in zip(headers, data_text.split('\t')):
        if not value:
            value = None
        _to_dict[key] = value

    vyear = None
    try:
        vyear = str2int(_to_dict.get("py"), None)
        if not vyear:
            logger.warning("WOS id: %s, abnormal year: %s" % (_to_dict["ut"], _to_dict.get("py")))
    except Exception as e:
        logger.exception("""
        Raw data: %s,
        Parsed dict: %s
        Exception: %s""" % (data, _to_dict, e))

    _to_dict["py"] = vyear

    return _to_dict


def parse_full_records_txt(content: bytes):
    lines = content.strip().split(b'\r\n')
    head_line = lines.pop(0)
    try:
        head_start = head_line.index(b'PT')
        head_line = head_line[head_start:]
        head_line = head_line.strip().decode('utf-8')
        HEADERS = head_line.split('\t')
        HEADERS = [s.lower() for s in HEADERS]
    except ValueError:
        logger.error("Unexpected header line, falling back to the default headers: %s" % head_line)
        HEADERS = ['PT', 'AU', 'Z2', 'AF', 'BA', 'BF', 'CA', 'GP', 'BE', 'TI', 'Z1', 'SO', 'Z3', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'Z5', 'ID', 'AB', 'Z4', 'C1', 'Z6', 'RP', 'EM', 'Z7', 'RI', 'OI', 'FU', 'FX', 'CR', 'NR', 'TC', 'Z9', 'Z8', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'SI', 'PN', 'SU', 'MA', 'BP', 'EP', 'AR', 'DI', 'D2', 'EA', 'EY', 'PG', 'P2', 'WC', 'SC', 'PM', 'UT', 'OA', 'HP', 'HC', 'DA', 'C3']
        HEADERS = [s.lower() for s in HEADERS]

    while lines:
        line_data = lines.pop(0)
        # print(line_data)
        standard_data = to_dict(line_data, HEADERS)
        # third_id = standard_data.pop('ut', None)
        # if not third_id:
        #     continue
        yield standard_data


def parse_full_records(body: Union[bytes, str]):
    """
    Parse the downloaded export content from the response.
    """
    if isinstance(body, str):
        body = body.encode()
    item_g = parse_full_records_txt(body)
    for data_dic in item_g:
        yield data_dic
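
A quick usage sketch of feeding a WOS tab-delimited export through parse_full_records (assumed; the file name is hypothetical):

# Hypothetical driver; 'savedrecs.txt' stands in for a real WOS export download.
with open('savedrecs.txt', 'rb') as fh:
    body = fh.read()

for record in parse_full_records(body):
    # Keys are the lower-cased header tags, e.g. ut (WOS id), py (year), ti (title).
    print(record.get('ut'), record.get('py'), record.get('ti'))
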
@@ -0,0 +1,40 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScienceArticlCnkiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class AddItemBase(scrapy.Item):
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class ArticleItem(AddItemBase):
    exported = scrapy.Field()


class IdRelationItem(AddItemBase):
    query_ids = scrapy.Field()
    school_ids = scrapy.Field()
    task_ids = scrapy.Field()


class ArticleCitedItem(AddItemBase):
    cited = scrapy.Field()


class CnkiCitedNumberItem(ArticleCitedItem):
    """Citation-count item for CNKI publications."""
    __tablename__ = 'relation_cited_number_cnki'

    third_id = scrapy.Field()
    cited = scrapy.Field()
    updated_at = scrapy.Field()
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# @Time : 2025/5/13 10:41
# @Author : zhaoxiangpeng
# @File : enum_cls.py

import enum
from datetime import timedelta


class ResourceType(enum.Enum):
    """Resource type (CNKI database identifiers)."""
    JOURNAL = "YSTT4HG0"       # academic journals
    DISSERTATION = "LSTPFY1C"  # theses and dissertations
    CONFERENCE = "JUP3MUPD"    # conference papers
    NEWSPAPER = "MPMFIG1A"     # newspapers
    ALMANAC = "HHCPM1F8"
    BOOK = "EMRPGLPA"
    PATENT = "VUDIXAIY"
    STANDARD = "WQ0UVIAA"
    ACHIEVEMENTS = "BLZOG7CK"


class SearchTypeId(enum.Enum):
    """CNKI search types."""
    ADV = 1
    SIMPLE = 2
    AUTHOR = 3
    EXPERT = 4  # professional/expert search
    SENTENCE = 5
    '''
    GROUP = 6
    PAGE = 7
    SORT = 8
    ABSTRACT = 9
    MORESENTENCE = 10
    HISTORY = 11
    SIZE = 12
    RESULT = 13
    ADVRESULT = 14
    EXPERTRESULT = 15
    AUTHORRESULT = 16
    SENRESULT = 17
    CROSSDBCHANGEDB = 18
    COMBOHISTORY = 19
    '''


class SearchFromId(enum.Enum):
    SEARCH = 1
    GROUPSEARCH = 2
    RESULT = 3
    PAGE = 4
    SORT = 5
    CHANGEDB = 6
    DISPLAYMODEL = 7
    NAVISEARCH = 8
    HISTORY = 9
    COMBOHISTORY = 10
    CROSSDBCHANGEDB = 11
    CHANGELANG = 12
    GROUP = 99


class SearchFieldEnum(enum.Enum):
    """Document metadata search fields (values are CNKI's Chinese field labels)."""
    SU = "主题"       # topic
    TKA = "篇关摘"    # title/keywords/abstract
    TI = "篇名"       # title
    KY = "关键词"     # keywords
    AB = "摘要"       # abstract
    CO = "小标题"     # subtitle
    FT = "全文"       # full text
    AU = "作者"       # author
    FI = "第一作者"   # first author
    RP = "通讯作者"   # corresponding author
    AF = "作者单位"   # author affiliation
    LY = "期刊名称"   # journal name
    RF = "参考文献"   # references
    FU = "基金"       # funding
    CLC = "中图分类号"  # Chinese Library Classification number
    SN = "ISSN"
    CN = "CN"
    DOI = "DOI"
    QKLM = "栏目信息"  # column/section info
    FAF = "第一单位"   # first affiliation
    CF = "被引频次"    # citation frequency


class OperatorEnum(enum.Enum):
    # Member names kept in Chinese as in the source: 模糊 = fuzzy, 精确 = exact.
    模糊 = "FUZZY"
    精确 = "DEFAULT"


class OperatorTypeEnum(enum.Enum):
    DEFAULT = 0
    TOPRANK = 1
    FUZZY = 2
    GT = 3
    GE = 4
    LT = 5
    LE = 6
    BETWEEN = 7
    FREQUENCY = 8
    PREFIX = 9
    SUFFIX = 10
    CONTAINS = 11
    NEAR = 12
    SENTENCE = 13
    IS = 14
    FUZZYFREQUENCY = 15


class LogicEnum(enum.Enum):
    AND = 0
    OR = 1
    NOT = 2


class UpdatedTimeEnum(enum.Enum):
    """
    Recent-period windows. Member names are Chinese: last week, last month,
    last half year, last year, year to date.
    """
    最近一周 = timedelta(days=7)
    最近一月 = timedelta(days=30)
    最近半年 = timedelta(days=180)
    最近一年 = timedelta(days=365)
    今年迄今 = timedelta(days=180)  # NOTE: same value as 最近半年, so Enum treats this member as an alias
@@ -0,0 +1,90 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from __future__ import annotations
from itemadapter import ItemAdapter


class ScienceArticlCnkiPipeline:
    def process_item(self, item, spider):
        return item


import logging
from datetime import datetime
from typing import TYPE_CHECKING, Tuple, Union

from pymongo import MongoClient
from itemadapter import ItemAdapter
from pymongo.errors import (
    DuplicateKeyError,
    BulkWriteError
)

from science_article_cnki.db_utils.mongo import MongoDBUtils, update_document, build_update_query

if TYPE_CHECKING:
    from scrapy.crawler import Crawler
    from scrapy.statscollectors import StatsCollector

mongo_logger = logging.getLogger('pymongo')
mongo_logger.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)


class MongoPipeline(MongoDBUtils):
    def __init__(self, mongo_uri, mongo_db, stats: StatsCollector):
        super().__init__(mongo_uri, mongo_db)
        self.stats: StatsCollector = stats
        self.insert_failure_update_enable = True

    @classmethod
    def from_crawler(cls, crawler: Crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
            stats=crawler.stats
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Determine the item type and its target collection
        adapter = ItemAdapter(item)
        item_type = self._get_item_type(item)
        collection = self.db.get_collection(item_type)
        d = adapter.asdict()
        try:
            insert_result = collection.insert_one(d)
            self.stats.inc_value("item2db_inserted/{}".format(item_type))
        except DuplicateKeyError as duplicate_error:
            if self.insert_failure_update_enable:
                write_error = duplicate_error.details
                key_pattern = write_error.get('keyPattern')
                key_value = write_error.get('keyValue')
                logger.debug("dupKey: %s, keyValue: %s", key_pattern, key_value)
                # Drop the _id and the unique-key fields, then update the remaining fields
                d.pop("_id", None)
                for k in key_pattern.keys():
                    d.pop(k, None)
                up_result = collection.update_one(filter=key_value, update={"$set": d}, upsert=True)
                self.stats.inc_value("item2db_updated/{}".format(item_type))
        except Exception:
            raise

        return item

    def close_spider(self, spider):
        self.client.close()

    @staticmethod
    def _get_item_type(item) -> str:
        """Return the item's collection name from __tablename__, or a fallback."""
        if hasattr(item, '__tablename__'):
            return item.__class__.__tablename__
        return 'items_null_table'
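
A sketch of how this pipeline would be registered in the project settings (assumed, not part of this diff; the module path and priority are guesses):

# settings.py -- hypothetical snippet
ITEM_PIPELINES = {
    "science_article_cnki.pipelines.MongoPipeline": 300,
}
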
@@ -0,0 +1,105 @@
# Scrapy settings for science_article_cnki project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "science_article_cnki"

SPIDER_MODULES = ["science_article_cnki.spiders"]
NEWSPIDER_MODULE = "science_article_cnki.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 3

# Disable cookies (enabled by default)
COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}
SEARCH_REQUEST_HEADERS = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'Hm_lvt_dcec09ba2227fd02c55623c1bb82776a=1739256689; UM_distinctid=197b0769b48ea3-0de0b4b2dd761f-26001051-1fa400-197b0769b49cc6; Ecp_ClientId=e250627180800765334; Ecp_ClientIp=111.186.53.36; cnkiUserKey=1b8e7dbe-3c98-864f-2b80-84b544af32af; _c_WBKFRo=UO8UFAxWLjMjlOxhuKvmtkZ4yYaXr8dPZXuhVFea; Ecp_loginuserbk=SJTU; tfstk=g5GqYEZ0ZId4NHSWG0FNzQCb6QNYs5-QjfZ_SV0gloqDDdFa7uoTCSMjSA5ZJuEOhdn6_lmxYPZ0DxMNb0nUXt99nAPZ2q5jhfuO_P0iXEE6kLgxk5FMAHTBOq3vhen9f3NMS4V_773PuGuxk5Q-60hJAqQN2mSLS5mgZz4gS540ItYPZPqliPf0SgYzWuVgSrX0ZT4_uGb0Sc0kzPEuolmgsUPu2PVgjcViG50mS_zQnU-thdfV8NPaxqqPs67Lu-cB9u5Mabzqzugc-1fiaryqZpcfbM2jI2eKGqONwSgEE74qjBx0ex0r_Jh9Csg0ZoPxa-bMXocxSfPYTNAmzSr4KbwXO1mnzVDQUbTH9SP0mANx5w-jzjojkbu1STV4GYyEgWAdmlMS8fzZ6hdrYqDnjASP1GUobXlt3GXanzUzAU8z4y3oBzrYp_6OB8VLzkTblOBTnzUzAU8PBOeu2zrBlr1..; Ecp_session=1; SID_sug=018104; knsLeftGroupSelectItem=; dsorders=CF; dsortypes=cur%20DESC; knsadv-searchtype=%7B%22BLZOG7CK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22MPMFIG1A%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22T2VC03OH%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JQIRZIYA%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22S81HNSV3%22%3A%22gradeSearch%22%2C%22YSTT4HG0%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22ML4DRIDX%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WQ0UVIAA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22VUDIXAIY%22%3A%22gradeSearch%2CmajorSearch%22%2C%22LIQN9Z3G%22%3A%22gradeSearch%22%2C%22NN3FJMUV%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22LSTPFY1C%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HHCPM1F8%22%3A%22gradeSearch%2CmajorSearch%22%2C%22OORPU5FE%22%3A%22gradeSearch%2CmajorSearch%22%2C%22WD0FTY92%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22BPBAFJ5S%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22EMRPGLPA%22%3A%22gradeSearch%2CmajorSearch%22%2C%22PWFIRAGL%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22U8J8LYLV%22%3A%22gradeSearch%2CmajorSearch%22%2C%22R79MZMCB%22%3A%22gradeSearch%22%2C%22J708GVCE%22%3A%22gradeSearch%2CmajorSearch%22%2C%228JBZLDJQ%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%22HR1YT1Z9%22%3A%22gradeSearch%2CmajorSearch%22%2C%22JUP3MUPD%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NLBO1Z6R%22%3A%22gradeSearch%2CmajorSearch%22%2C%22RMJLXHZ3%22%3A%22gradeSearch%2CmajorSearch%2CsentenceSearch%22%2C%221UR4K4HZ%22%3A%22gradeSearch%2CmajorSearch%2CauthorSearch%2CsentenceSearch%22%2C%22NB3BWEHK%22%3A%22gradeSearch%2CmajorSearch%22%2C%22XVLO76FD%22%3A%22gradeSearch%2CmajorSearch%22%7D; Ecp_IpLoginFail=25121149.65.252.186; SID_kns_new=kns018106; SID_restapi=kns018110; KNS2COOKIE=1765437722.656.114388.232155|b25e41a932fd162af3b8c5cff4059fc3; dblang=both; createtime-advInput=2025-12-11%2015%3A22%3A21; searchTimeFlags=1',
    'Origin': 'https://kns.cnki.net',
    'Referer': 'https://kns.cnki.net/kns8s/AdvSearch?crossids=YSTT4HG0%2CLSTPFY1C%2CJUP3MUPD%2CMPMFIG1A%2CWQ0UVIAA%2CBLZOG7CK%2CPWFIRAGL%2CEMRPGLPA%2CNLBO1Z6R%2CNN3FJMUV',
    'User-Agent': USER_AGENT,
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "science_article_cnki.middlewares.ScienceArticlCnkiSpiderMiddleware": 543,
#}

RETRY_ENABLED = True
RETRY_TIMES = 2  # retry each failed request up to 2 times
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # extended with some common error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
    # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543,
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "science_article_cnki.middlewares.ScienceArticlCnkiDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "science_article_cnki.pipelines.ScienceArticlCnkiPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"


MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,10 @@
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    def parse(self, response):
        pass
@@ -0,0 +1,17 @@
from typing import List, Tuple
from datetime import datetime


def str2int(val, replace=0):
    try:
        val = int(val)
    except (ValueError, TypeError):
        val = replace
    return val


def get_today_date(fmt: str = "%Y-%m-%d"):
    return datetime.today().strftime(fmt)
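
For reference, how str2int behaves with its replace parameter (this is how parse_data.to_dict uses it for the publication year):

# Quick illustration of the fallback behaviour.
assert str2int("2021") == 2021
assert str2int("N/A") == 0           # default replacement
assert str2int(None, replace=None) is None
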
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = science_article_cnki.settings

[deploy]
#url = http://localhost:6800/
project = science_article_cnki