diff --git a/science_article_cnki/tests/test_item_exists.py b/science_article_cnki/tests/test_item_exists.py
new file mode 100644
index 0000000..006ec92
--- /dev/null
+++ b/science_article_cnki/tests/test_item_exists.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# @Time : 2026/1/13 14:54
+# @Author : zhaoxiangpeng
+# @File : test_item_exists.py
+
+from pymongo import MongoClient
+from pymongo.database import Database
+from pymongo.collection import Collection
+from science_article_cnki.db_utils.mongo import MongoDBUtils
+from science_article_cnki.settings import MONGO_URI, MONGO_DATABASE
+
+# Shared client and database handle, built from the project settings.
+client: MongoClient = MongoClient(MONGO_URI)
+db: Database = client[MONGO_DATABASE]
+
+
+def test_item_exists():
+    """Look up one article by ``third_id`` and print the matching document."""
+    collection: Collection = db.get_collection('data_cnki_article')
+    # Project only third_id; find_one returns None when nothing matches.
+    results = collection.find_one(
+        filter={"third_id": {"$in": ['SCJI202502004']}},
+        projection={"_id": 0, "third_id": 1},
+    )
+    print(results)
+
diff --git a/science_article_cnki/tests/test_more_so.py b/science_article_cnki/tests/test_more_so.py
new file mode 100644
index 0000000..7743025
--- /dev/null
+++ b/science_article_cnki/tests/test_more_so.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+# @Time : 2026/1/13 16:08
+# @Author : zhaoxiangpeng
+# @File : test_more_so.py
+
+from parsel import Selector
+
+# Field names assigned, in column order, to the cells of each table row.
+TABLE_HEAD_EN = ['src_db', 'title', 'author', 'org', 'journal', 'keyword', 'abstract', 'pub_time', 'first_duty', 'fund', 'year', 'volum', 'issue', 'page', 'classification_code', 'issn', 'url', 'doi']
+
+
+def test_parser():
+    """Read an exported file, walk its <tr>/<td> rows and print each as a dict."""
+    # Raw string: "\c" and "\C" are not valid escape sequences in a normal literal.
+    with open(r'Y:\cnki-metadata\CNKI-20260112161602991.xls', encoding='utf-8') as f:
+        data = f.read()
+    print(data)
+    selector = Selector(data)
+    rows = selector.xpath(r'//tr')
+    # The first <tr> is skipped.
+    for row in rows[1:]:
+        cols = row.xpath('./td')
+        row_datas = [col.xpath('string(.)').get().strip() for col in cols]
+        # Separate name so the raw file text held in `data` is not overwritten.
+        record = dict(zip(TABLE_HEAD_EN, row_datas))
+        # Skip rows whose first cell is the column caption rather than a value.
+        if record.get('src_db') == 'SrcDatabase-来源库':
+            continue
+        print(record)