From 752521c87c37b56e16a24680c39c867097f66a2d Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Thu, 12 Mar 2026 14:11:33 +0800
Subject: [PATCH] =?UTF-8?q?test:cnki=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/test_item_exists.py                 | 26 +++++++++++++++++
 science_article_cnki/tests/test_more_so.py    | 29 +++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 science_article_cnki/tests/test_item_exists.py
 create mode 100644 science_article_cnki/tests/test_more_so.py

diff --git a/science_article_cnki/tests/test_item_exists.py b/science_article_cnki/tests/test_item_exists.py
new file mode 100644
index 0000000..006ec92
--- /dev/null
+++ b/science_article_cnki/tests/test_item_exists.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# @Time : 2026/1/13 14:54
+# @Author : zhaoxiangpeng
+# @File : test_item_exists.py
+
+from pymongo import MongoClient
+from pymongo.database import Database
+from pymongo.collection import Collection
+from science_article_cnki.db_utils.mongo import MongoDBUtils
+from science_article_cnki.settings import MONGO_URI, MONGO_DATABASE
+
+# Shared client/database handles, created once when the module is imported.
+client: MongoClient = MongoClient(MONGO_URI)
+db: Database = client[MONGO_DATABASE]
+
+
+def test_item_exists():
+    """Look up one article by ``third_id`` and print the query result."""
+    collection: Collection = db.get_collection('data_cnki_article')
+    # Project only ``third_id`` so the lookup returns a minimal document.
+    result = collection.find_one(
+        filter={"third_id": {"$in": ['SCJI202502004']}},
+        projection={"_id": 0, "third_id": 1},
+    )
+    # ``find_one`` yields None when no document matches the filter.
+    print(result)
diff --git a/science_article_cnki/tests/test_more_so.py b/science_article_cnki/tests/test_more_so.py
new file mode 100644
index 0000000..7743025
--- /dev/null
+++ b/science_article_cnki/tests/test_more_so.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+# @Time : 2026/1/13 16:08
+# @Author : zhaoxiangpeng
+# @File : test_more_so.py
+
+from parsel import Selector
+
+# Column keys zipped positionally with the td cells of each table row.
+# NOTE(review): 'volum' looks like a typo for 'volume'; it is kept unchanged
+# because it is a runtime dictionary key.
+TABLE_HEAD_EN = ['src_db', 'title', 'author', 'org', 'journal', 'keyword', 'abstract', 'pub_time', 'first_duty', 'fund', 'year', 'volum', 'issue', 'page', 'classification_code', 'issn', 'url', 'doi']
+
+
+def test_parser():
+    """Read the CNKI export file, parse its table rows and print one dict per row."""
+    # Raw string: '\c' and '\C' are not valid escape sequences, so a raw
+    # literal keeps the same path value without the invalid-escape warning.
+    with open(r'Y:\cnki-metadata\CNKI-20260112161602991.xls', encoding='utf-8') as f:
+        markup = f.read()
+    print(markup)
+    selector = Selector(markup)
+    # The first tr element is skipped; presumably it is the header row.
+    for row in selector.xpath(r'//tr')[1:]:
+        cells = [col.xpath('string(.)').get().strip() for col in row.xpath('./td')]
+        record = dict(zip(TABLE_HEAD_EN, cells))
+        # Skip rows whose 'src_db' cell holds the header label text.
+        if record.get('src_db') == 'SrcDatabase-来源库':
+            continue
+        print(record)