Compare commits
No commits in common. '752521c87c37b56e16a24680c39c867097f66a2d' and '21963b7f800ee524ba4a4ad83c2f057eed45b616' have entirely different histories.
752521c87c
...
21963b7f80
@ -1,40 +0,0 @@
|
|||||||
from typing import AsyncIterator, Any
|
|
||||||
|
|
||||||
import scrapy
|
|
||||||
|
|
||||||
from science_article_cnki.models import cnki_model as model
|
|
||||||
from science_article_cnki.configs import cnki as config
|
|
||||||
|
|
||||||
|
|
||||||
class CnkiLatestIncrementSpider(scrapy.Spider):
    """Incremental CNKI spider: issues one advanced-search POST for the
    configured query and hands the response to ``parse``.

    NOTE(review): ``query_id``/``query`` are bare annotations — they must be
    supplied externally (e.g. ``scrapy crawl -a query=...``) or ``start``
    raises AttributeError; confirm the launch convention.
    """

    name = "cnki_latest_increment"
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
        },
        ITEM_PIPELINES={
            "science_article_cnki.pipelines.MongoPipeline": 300,
            "science_article_cnki.pipelines.DupTodoPipeline": 310,
            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
        },
        # LOG_LEVEL="INFO"
    )
    source = 'cnki'
    # Resource type sent to CNKI ("学术期刊" = academic journals).
    resource_type: str = "学术期刊"

    query_id: int
    query: str
    # NOTE(review): mutable class-level default — shared by all instances if
    # ever mutated in place; safe only while it is always replaced wholesale.
    filters: list = list()

    async def start(self) -> AsyncIterator[Any]:
        # Assemble the parameter dict for the advanced-search API call.
        m = dict(query=self.query, resource_type=self.resource_type, page=1)
        m.update(filters=self.filters)
        query_body = model.adv_refine_search(**m)
        # Merge the filter options into the query body.
        model.add_muti_filters(base_query=query_body, filters=m.get("filters"))
        form_d = model.adv_query_search(query_body, **m)
        # The original query dict travels in request.meta for downstream use.
        yield scrapy.FormRequest(url=config.CNKI_ADV_SEARCH_API, method="POST",
                                 formdata=form_d, meta=dict(REQUEST_Q=m))

    def parse(self, response):
        # Placeholder — presumably results are handled elsewhere; confirm.
        pass
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# @Time : 2026/1/13 14:54
|
|
||||||
# @Author : zhaoxiangpeng
|
|
||||||
# @File : test_item_exists.py
|
|
||||||
|
|
||||||
from pymongo import MongoClient
|
|
||||||
from pymongo.database import Database
|
|
||||||
from pymongo.collection import Collection
|
|
||||||
from science_article_cnki.db_utils.mongo import MongoDBUtils
|
|
||||||
from science_article_cnki.settings import MONGO_URI, MONGO_DATABASE
|
|
||||||
|
|
||||||
# Shared MongoDB handles for the tests below (connects at import time).
client: MongoClient = MongoClient(MONGO_URI)
db: Database = client[MONGO_DATABASE]


def test_item_exists():
    """Smoke-check that a known CNKI article id is present in Mongo."""
    collection: Collection = db.get_collection('data_cnki_article')
    # Project only third_id to keep the result small; find_one returns None
    # when no document matches.
    results = collection.find_one(filter={"third_id": {"$in": ['SCJI202502004']}}, projection={"_id": 0, "third_id": 1})
    print(results)
|
|
||||||
|
|
||||||
@ -1,26 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# @Time : 2026/1/13 16:08
|
|
||||||
# @Author : zhaoxiangpeng
|
|
||||||
# @File : test_more_so.py
|
|
||||||
|
|
||||||
from parsel import Selector
|
|
||||||
|
|
||||||
TABLE_HEAD_EN = ['src_db', 'title', 'author', 'org', 'journal', 'keyword', 'abstract', 'pub_time', 'first_duty', 'fund', 'year', 'volum', 'issue', 'page', 'classification_code', 'issn', 'url', 'doi']
|
|
||||||
|
|
||||||
|
|
||||||
def test_parser():
|
|
||||||
with open('Y:\cnki-metadata\CNKI-20260112161602991.xls', encoding='utf-8') as f:
|
|
||||||
data = f.read()
|
|
||||||
print(data)
|
|
||||||
selector = Selector(data)
|
|
||||||
rows = selector.xpath(r'//tr')
|
|
||||||
for row in rows[1:]:
|
|
||||||
cols = row.xpath('./td')
|
|
||||||
row_datas = []
|
|
||||||
for col in cols:
|
|
||||||
col_data = col.xpath('string(.)').get().strip()
|
|
||||||
row_datas.append(col_data)
|
|
||||||
data = dict(zip(TABLE_HEAD_EN, row_datas))
|
|
||||||
if data.get('src_db') == 'SrcDatabase-来源库':
|
|
||||||
continue
|
|
||||||
print(data)
|
|
||||||
@ -1,159 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# @Time : 2026/1/21 16:45
|
|
||||||
# @Author : zhaoxiangpeng
|
|
||||||
# @File : firld_parser.py
|
|
||||||
import json
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Dict, Callable, Any, List
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
class ScopusFieldParsing:
    """Flatten one Scopus abstracts-retrieval JSON record into the project's
    article schema.

    Each ``parse_*`` staticmethod extracts one field group and returns a
    plain dict; ``_parsing`` merges all groups for one DataFrame row and
    ``parsing`` maps it over a DataFrame.

    NOTE(review): ``handle_format_str``, ``json_dumps``, ``FormatUtil`` and
    ``process_author_address_relation_row`` are used but not imported in the
    visible code — presumably module-level helpers elsewhere; confirm.
    """

    @staticmethod
    def parse_basic_information(frame: Dict[str, dict]) -> Dict[str, Any]:
        """id / title / url / article-type / doi.

        NOTE(review): the URL is built against the CSSCI site although the
        class is named ScopusFieldParsing — looks copy-pasted; confirm.
        """
        return dict(
            id=frame.get("sno"),
            title=frame.get("lypm"),
            title_format=handle_format_str(frame.get("lypm")),
            abstract=None,
            url='http://cssci.nju.edu.cn/control/controllers.php?control=search&action=source_id&id=' + frame.get("sno", ''),
            article_type_string=frame.get("subtypeDescription"),
            doi=frame.get("prism:doi"),
        )

    @staticmethod
    def parse_date_information(frame: Dict[str, dict]) -> Dict[str, Any]:
        """Publication-date fields.

        NOTE(review): this method (and several below) first unwraps
        ``frame.get("frame")``, but ``_parsing`` passes the
        abstracts-retrieval-response dict directly, which would make
        ``frame`` None here — verify the actual input shape.
        """
        frame = frame.get("frame")
        # NOTE(review): `date` is computed but never used.
        date = frame.get("prism:coverDate")
        source = frame.get("item", {}).get("bibrecord", {}).get("head", {}).get("source")
        publicationdate = source.get("publicationdate")

        def f():
            # Serialize the (single) publication date as a JSON list string.
            results = [dict(
                pub_year=publicationdate.get("year"),
                v_month=publicationdate.get("month"),
                v_day=publicationdate.get("day")
            )]
            return json_dumps(results, ensure_ascii=False)

        return dict(
            vyear=publicationdate.get("year"),
            pub_date=f(),
            ea_year=None,
            ea_month=None,
        )

    @staticmethod
    def parse_article_source_information(frame: Dict[str, dict]) -> Dict[str, Any]:
        """Volume / issue / language / page fields."""
        frame = frame.get("frame")
        return dict(
            volume=frame.get("prism:volume"),
            issue=frame.get("prism:issueIdentifier"),
            # NOTE(review): raises AttributeError if "language" is absent.
            lang=frame.get("language").get("@xml:lang"),
            pages=None,
            startpage=frame.get("prism:startingPage"),
            endpage=frame.get("prism:endingPage"),
        )

    @staticmethod
    def parse_source_information(frame: Dict[str, dict]) -> Dict[str, Any]:
        """Journal name plus print/electronic ISSN, split by '@type'."""
        frame = frame.get("frame")
        source = frame.get("item", {}).get("bibrecord", {}).get("head", {}).get("source")
        issn_list = source.get("issn")
        # A single ISSN arrives as a bare dict; normalize to a list.
        if isinstance(issn_list, dict):
            issn_list = [issn_list]
        issn = None
        eissn = None
        for issn_obj in issn_list:
            if issn_obj.get('@type') == "print":
                issn = issn_obj.get("$")
            elif issn_obj.get('@type') == "electronic":
                eissn = issn_obj.get("$")
            else:
                # Untyped entries fall back to the print slot.
                issn = issn_obj.get("$")

        return dict(
            journal=frame.get("prism:publicationName"),
            journal_format=handle_format_str(frame.get("prism:publicationName"), str_type="en"),
            issn=FormatUtil.formatISSN(issn),
            eissn=FormatUtil.formatISSN(eissn),
            cn=None,
            isbn=None,
        )

    @staticmethod
    def parse_meeting_information(frame: pd.DataFrame = None) -> Dict[str, Any]:
        """Meeting fields — not available in this source; all None."""
        return dict(
            meeting_name=None,
            meeting_time=None,
            meeting_address=None,
        )

    @staticmethod
    def parse_publish_information(frame: pd.DataFrame = None) -> Dict[str, Any]:
        """Publisher fields (only the country code is available).

        NOTE(review): unlike earlier methods this reads ``frame`` directly
        without the ``.get("frame")`` unwrap, and the pd.DataFrame annotation
        does not match the dict-style access — confirm the intended input.
        """
        source = frame.get("item", {}).get("bibrecord", {}).get("head", {}).get("source")
        return dict(
            publisher=None,
            pub_city=None,
            pub_country=source.get("@country"),
        )

    @staticmethod
    def parse_author_information(frame: pd.DataFrame = None) -> Dict[str, Any]:
        """Collect ORCIDs and the author/address relation fields."""
        author_group: List[dict] = frame.get("item", {}).get("bibrecord", {}).get("head", {}).get("author-group", [])
        orcid_list = []
        for group in author_group:
            # NOTE(review): `affiliation`, `surname`, `given_name` and `auid`
            # are extracted but unused — only ORCIDs are kept here.
            affiliation: dict = group.get("affiliation", {})
            author_list: List[dict] = group.get("author", [])
            for author_obj in author_list:
                surname = author_obj.get("ce:surname")
                given_name = author_obj.get("ce:given-name")
                auid = author_obj.get("@auid")
                orcid = author_obj.get("@orcid")
                if orcid:
                    orcid_list.append(orcid)
        # Author order / address order / relation come from a shared helper.
        result_dict = process_author_address_relation_row(frame)

        return dict(
            email=None,
            researcher_id=None,
            orc_id='; '.join(orcid_list) if orcid_list else None,
            author_order=result_dict['author_order'],
            address_order=result_dict['address_order'],
            relation_author_address=result_dict['relation_author_address'],
        )

    @staticmethod
    def parse_other_information(frame: Dict[str, dict]) -> Dict[str, Any]:
        """Keywords, subject areas and fixed source-type tags."""
        authkeywords = frame.get("authkeywords", {})
        auth_keywords = authkeywords.get("author-keyword", [])
        keywords = json_dumps([auth_keyword.get("$") for auth_keyword in auth_keywords], ensure_ascii=False)
        subject_areas = frame.get("subject-areas", {}).get("subject-area", [])
        sub_areas = json_dumps([subject_area.get("$") for subject_area in subject_areas], ensure_ascii=False)
        return dict(
            key_words=keywords,
            sub_code=sub_areas,
            source_type="2",
            wos_we_tag=None,
        )

    def _parsing(self, row) -> Dict[str, Any]:
        """Parse one row: expects 'scopus_json' and 'updated_time' fields."""
        scopus_json = row.get('scopus_json')
        df_dict = json.loads(scopus_json)
        df = df_dict.get("abstracts-retrieval-response")
        new_dict = dict()
        new_dict.update(self.parse_basic_information(df))
        new_dict.update(self.parse_date_information(df))
        new_dict.update(self.parse_article_source_information(df))
        new_dict.update(self.parse_source_information(df))
        new_dict.update(self.parse_meeting_information(df))
        new_dict.update(self.parse_publish_information(df))
        new_dict.update(self.parse_author_information(df))
        new_dict.update(self.parse_other_information(df))
        new_dict.update(dict(updated_time=row.get('updated_time')))
        return new_dict

    def parsing(self, df: pd.DataFrame = None):
        """Apply ``_parsing`` row-wise and return the results as a list."""
        result = df[['scopus_json', 'updated_time']].apply(self._parsing, axis=1)
        pdf_result = list(result.values)
        return pdf_result
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
# Automatically created by: scrapy startproject
|
|
||||||
#
|
|
||||||
# For more information about the [deploy] section see:
|
|
||||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
|
||||||
|
|
||||||
[settings]
|
|
||||||
default = science_article_cssci.settings
|
|
||||||
|
|
||||||
[deploy]
|
|
||||||
#url = http://localhost:6800/
|
|
||||||
project = science_article_cssci
|
|
||||||
@ -1,39 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# @Time : 2026/1/20 17:06
|
|
||||||
# @Author : zhaoxiangpeng
|
|
||||||
# @File : crawl_article_by_id.py
|
|
||||||
import time
|
|
||||||
import logging
|
|
||||||
import json
|
|
||||||
from typing import List
|
|
||||||
import redis
|
|
||||||
from twisted.internet import defer
|
|
||||||
from scrapy.crawler import CrawlerProcess
|
|
||||||
from scrapy.utils.project import get_project_settings
|
|
||||||
from science_article_cssci.spiders.cssci_article_by_id import CssciArticleByIdSpider
|
|
||||||
|
|
||||||
|
|
||||||
def push_task(third_ids=None):
    """Seed the spider's redis start-urls queue with article ids.

    Args:
        third_ids: iterable of CSSCI ``third_id`` strings to enqueue.
            When None, the original hard-coded sample batch is used —
            keeping the zero-argument call backward compatible.
    """
    if third_ids is None:
        third_ids = [
            '11G0412025010007',
            '11C1172023010002',
            '11J0092023020008',
            '44Z0712023010003',
            '11D1022023010001',
            '22D1042023010007',
        ]
    settings = get_project_settings()
    r = redis.StrictRedis.from_url(settings.get("REDIS_URL"))
    # Each queue entry is a JSON payload the spider decodes into a request.
    r.lpush(
        "cssci_article_by_id:start_urls",
        *[json.dumps({'third_id': tid}, ensure_ascii=False) for tid in third_ids],
    )
|
|
||||||
|
|
||||||
|
|
||||||
def starter():
    """Run CssciArticleByIdSpider inside a blocking CrawlerProcess."""
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl(CssciArticleByIdSpider)
    # Blocks until the crawl finishes and the reactor stops.
    crawler_process.start()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Enqueue the sample ids first, then start the (blocking) crawl.
    push_task()
    starter()
|
|
||||||
File diff suppressed because one or more lines are too long
@ -1,32 +0,0 @@
|
|||||||
class Settings:
    """XPath locators and API routes for driving the Web of Science export
    UI (English-locale titles; ``env`` is "dev").
    """

    env = "dev"
    SEARCH_ROUTE = '/api/wosnx/core/runQuerySearch'
    EXPORT_ROUTE = '/api/wosnx/indic/export/saveToFile'
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science Core Collection"]'
    QUERY_INPUT_ELE = '//*[@id="advancedSearchInputArea"]'
    SEARCH_BUTTON_ELE = '//button[@data-ta="run-search"]/span[@class="mat-mdc-button-touch-target"]'

    EXPORT_BUTTON_ELE = '//*[@id="export-trigger-btn"]'
    TABWIN_BUTTON_ELE = '//*[@id="exportToTabWinButton"]'  # tab-delimited-file button

    RECORD_TYPE_SELECT_ELE = '//div[@class="ng-star-inserted"]/wos-select/button[@aria-haspopup="listbox"]'  # record-content select box
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record"]'  # "Full Record" option
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="Full Record and Cited References"]'  # full record + cited references

    RECORD_RANGE_ELE = '//*[@id="radio3-input"]'  # record-range radio button
    RECORD_EXPORT_START_ELE = '//input[@name="markFrom"]'
    RECORD_EXPORT_END_ELE = '//input[@name="markTo"]'

    EXPORT_FILE_ELE = '//*[@id="exportButton"]'

    # Sample advanced-search query string.
    INPUT_CONTENT = '(OG=(Anhui University of Science & Technology)) AND PY=(2025)'
||||||
|
|
||||||
|
|
||||||
class ProSettings(Settings):
    """Locators for the Chinese-localized Web of Science UI."""

    # Fix: the base class spells this attribute DB_CHANGE_ELE, so the
    # original `DB_CHANGE` never actually overrode the English locator.
    # Override the real name and keep DB_CHANGE as a compatibility alias.
    DB_CHANGE_ELE = '//*[@id="global-select"]/div/div[@aria-label="Select database"]/div[@title="Web of Science 核心合集"]'
    DB_CHANGE = DB_CHANGE_ELE
    # Fix: '//botton' -> '//button' (the misspelled tag could never match).
    EXPORT_BUTTON_ELE = '//button[@id="export-trigger-btn"]'
    FULL_RECORD_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="完整记录"]'  # "Full Record" option
    FULL_RECORD_REFERENCE_ELE = '//div[@id="global-select"]//div[@class="options options-menu"]/div[@title="全记录与引用的参考文献"]'  # full record + cited references
|
|
||||||
|
|
||||||
# Module-level singleton used by importers of this settings module.
settings = Settings()
|
|
||||||
Loading…
Reference in New Issue