# -*- coding: utf-8 -*-
# @Time : 2026/1/13 16:08
# @Author : zhaoxiangpeng
# @File : test_more_so.py

from parsel import Selector

# English field names, paired positionally with the <td> cells of each row.
# NOTE: these strings are runtime dict keys, so their spelling (e.g. 'volum')
# is deliberately left untouched.
TABLE_HEAD_EN = [
    'src_db', 'title', 'author', 'org', 'journal', 'keyword', 'abstract',
    'pub_time', 'first_duty', 'fund', 'year', 'volum', 'issue', 'page',
    'classification_code', 'issn', 'url', 'doi',
]


def _iter_records(markup):
    """Yield one dict per data row found in *markup*.

    Every ``<tr>`` after the first is read; the stripped text of its ``<td>``
    cells is zipped with ``TABLE_HEAD_EN`` (``zip`` truncates to the shorter
    side, so extra cells or missing cells are silently dropped).

    Rows whose first cell equals ``'SrcDatabase-来源库'`` are skipped --
    presumably a header row repeated inside the export; confirm against a
    real file.
    """
    selector = Selector(markup)
    rows = selector.xpath(r'//tr')
    for row in rows[1:]:  # the first <tr> is never treated as data
        cells = [
            cell.xpath('string(.)').get().strip()
            for cell in row.xpath('./td')
        ]
        record = dict(zip(TABLE_HEAD_EN, cells))
        if record.get('src_db') == 'SrcDatabase-来源库':
            continue
        yield record


def test_parser():
    """Read a CNKI export from a fixed path and print every parsed row.

    The file carries an ``.xls`` extension but is opened as UTF-8 text and
    handed to parsel's ``Selector``, which is queried for ``<tr>``/``<td>``
    elements. The raw text is printed first, then one dict per data row.
    """
    # Raw string: '\c' and '\C' are not valid escape sequences, so the
    # non-raw form triggers a SyntaxWarning on recent Python versions.
    export_path = r'Y:\cnki-metadata\CNKI-20260112161602991.xls'
    with open(export_path, encoding='utf-8') as f:
        markup = f.read()
    print(markup)

    for record in _iter_records(markup):
        print(record)