You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
27 lines
909 B
Python
27 lines
909 B
Python
# -*- coding: utf-8 -*-
|
|
# @Time : 2026/1/13 16:08
|
|
# @Author : zhaoxiangpeng
|
|
# @File : test_more_so.py
|
|
|
|
from parsel import Selector
|
|
|
|
# English keys for the table columns, in column order. test_parser() zips
# this list with the <td> cells of each row to build one dict per record.
# NOTE(review): 'volum' looks like a misspelling of 'volume'; it is kept
# unchanged because it is a runtime dict key that other code may rely on.
TABLE_HEAD_EN = [
    'src_db',
    'title',
    'author',
    'org',
    'journal',
    'keyword',
    'abstract',
    'pub_time',
    'first_duty',
    'fund',
    'year',
    'volum',
    'issue',
    'page',
    'classification_code',
    'issn',
    'url',
    'doi',
]
|
|
|
|
|
|
def test_parser():
    """Read the hard-coded export file and print every data row as a dict.

    The file is read as UTF-8 text and parsed with ``parsel.Selector``.
    Every ``<tr>`` after the first is converted to a dict by pairing
    ``TABLE_HEAD_EN`` with the stripped text of the row's ``<td>`` cells.
    Rows whose first column still holds the header caption
    ``'SrcDatabase-来源库'`` are skipped; all other rows are printed.
    """
    # Raw string: the Windows path contains backslashes, and '\c' / '\C' are
    # invalid escape sequences in a normal literal (SyntaxWarning on 3.12+).
    # The resulting path value is unchanged.
    with open(r'Y:\cnki-metadata\CNKI-20260112161602991.xls', encoding='utf-8') as f:
        content = f.read()
    # Dump the raw file text first, as a debugging aid.
    print(content)

    selector = Selector(content)
    rows = selector.xpath('//tr')
    # The first <tr> is skipped unconditionally.
    for row in rows[1:]:
        # string(.) concatenates all descendant text of the cell.
        cells = [col.xpath('string(.)').get().strip() for col in row.xpath('./td')]
        # zip() stops at the shorter sequence, so surplus cells or missing
        # trailing columns are dropped silently.
        record = dict(zip(TABLE_HEAD_EN, cells))
        # Skip any further row that still carries the header caption.
        if record.get('src_db') == 'SrcDatabase-来源库':
            continue
        print(record)
|