You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

27 lines
909 B
Python

# -*- coding: utf-8 -*-
# @Time : 2026/1/13 16:08
# @Author : zhaoxiangpeng
# @File : test_more_so.py
from parsel import Selector
# English keys assigned, in order, to the cells of each table row.
TABLE_HEAD_EN = [
    'src_db', 'title', 'author', 'org', 'journal', 'keyword',
    'abstract', 'pub_time', 'first_duty', 'fund', 'year', 'volum',
    'issue', 'page', 'classification_code', 'issn', 'url', 'doi',
]
def test_parser():
    """Parse the exported table file and print one dict per data row.

    The file is read as UTF-8 text and queried with XPath for ``tr`` and
    ``td`` elements, so despite its ``.xls`` extension it is handled as
    markup here.  The first ``tr`` is skipped unconditionally; any later row
    whose first cell equals the bilingual header label is skipped as well.

    Cell texts are paired positionally with ``TABLE_HEAD_EN``.  ``zip`` stops
    at the shorter sequence, so surplus cells (or missing trailing cells) are
    dropped without an error.
    """
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences "\c" and "\C", which newer Python versions warn about.
    # The resulting path value is unchanged.
    source_path = r'Y:\cnki-metadata\CNKI-20260112161602991.xls'
    with open(source_path, encoding='utf-8') as f:
        html = f.read()
    print(html)

    selector = Selector(html)
    rows = selector.xpath(r'//tr')
    for row in rows[1:]:
        # string(.) concatenates all descendant text of the cell.
        cells = [
            col.xpath('string(.)').get().strip()
            for col in row.xpath('./td')
        ]
        # Separate name so the file content above is not shadowed.
        record = dict(zip(TABLE_HEAD_EN, cells))
        if record.get('src_db') == 'SrcDatabase-来源库':
            # Header row appearing among the data rows; not a record.
            continue
        print(record)