# -*- coding: utf-8 -*-
# @Time : 2025/1/8 14:28
# @Author : zhaoxiangpeng
# @File : 20250108_func.py
# Pipeline overview:
# 1. Build the citation (reference) table
#    1.1 parse the cited-references (CR) field of the publication table
#    1.2 download the full record of every reference that has a DOI
#    1.3 match the full records onto the references that have a DOI
#    1.4 keep the parsed partial info for references without a DOI (CR字段扩展.csv)
#    1.5 keep the publication's unique id for later matching (标准doi扩展完整字段.csv)
# 2. Build the publication table
#    2.1 match per-subject / per-school publications to obtain school and author info
#    2.2 join the publication results with the citation results

import os

import pandas as pd
from pymongo import MongoClient

READ_PATH = 'Y:\\wos-metadata\\SCHOOL\\bnu'
ROOT_PATH = 'Y:\\数据采集需求\\陶思琪20211203'

# 'DOI ' marks the start of the DOI inside one cited-reference string.
DOI_SPLIT_SYMBOL = 'DOI '
DOI_SPLIT_SYMBOL_LENGTH = len(DOI_SPLIT_SYMBOL)
# ', ' separates the author / year / source fields inside one reference.
INFO_SPLIT_SYMBOL = ', '
INFO_SPLIT_SYMBOL_LENGTH = len(INFO_SPLIT_SYMBOL)

# The reference table needs a UT field to act as the primary key.
REF_RECORD_TABLE_FIELD = ['UT', 'doi']
# Fields kept from the full record,
# {'exported.%s' % key.lower(): 1 for key in FULL_RECORD_TABLE_FIELD}
FULL_RECORD_TABLE_FIELD = ['DI', 'SO', 'PT', 'UT', 'AB', 'SN', 'EI', 'BN', 'PY']
# Maps the lower-case mongo field names back to the canonical WOS headers.
TABLE_HEAD_TRANS = {k.lower(): k for k in FULL_RECORD_TABLE_FIELD}

# SECURITY NOTE(review): credentials are hard-coded in source; they should be
# loaded from environment variables or a config file kept out of VCS.
MONGODB_REMOTE_CONFIG = dict(
    host='101.43.239.105',
    port=27017,
    db='science2',
    username='science-dev',
    password='kcidea1509!%25)(',
    url='mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true'
)
client: MongoClient = MongoClient(MONGODB_REMOTE_CONFIG.get("url"))
database = client.get_database(MONGODB_REMOTE_CONFIG.get('db'))


def find_doi_data_from_mongo(doi_list: list):
    """Yield the exported WOS record for every DOI in *doi_list*.

    The DOI match is case-insensitive (collation strength 2).  Each yielded
    dict is the document's ``exported`` sub-document with the document's
    ``third_id`` filled in under the ``ut`` key when it is missing.
    """
    collection = database.get_collection('data_wos_article')
    find_results = collection.find(
        filter={"exported.di": {"$in": doi_list}},
        projection={"_id": 0, "third_id": 1,
                    'exported.ab': 1, 'exported.bn': 1, 'exported.di': 1,
                    'exported.ei': 1, 'exported.is': 1, 'exported.pt': 1,
                    'exported.sn': 1, 'exported.so': 1}
    ).collation({"locale": "en", "strength": 2})  # ignore case when matching DOIs
    for document in find_results:
        exported: dict = document.get('exported')
        third_id = document.get('third_id')
        # Carry the WOS id into the record so it can serve as the UT key later.
        exported.setdefault('ut', third_id)
        yield exported


def step_2_2(base_table: pd.DataFrame, ref_table: pd.DataFrame):
    """2.2 Join the publication results with the citation results.

    Left-joins *ref_table* onto *base_table* via the WOS id
    (``wos`` column on the left, ``third_id`` on the right).
    """
    out_table = base_table.merge(right=ref_table, how='left',
                                 left_on=['wos'], right_on=['third_id'])
    return out_table


def verify_doi(text) -> str:
    """Return the bare DOI contained in *text*.

    Strips an optional leading ``'DOI '`` marker and a single trailing
    ``')'`` left over from the reference formatting.
    """
    doi_idx = text.find(DOI_SPLIT_SYMBOL)
    if doi_idx != -1:
        doi_str = text[doi_idx + DOI_SPLIT_SYMBOL_LENGTH:]
    else:
        doi_str = text
    if doi_str.endswith(')'):
        doi_str = doi_str[:-1]
    return doi_str


def ref_str2dic(text):
    """Parse a WOS ``CR`` field into per-reference dicts.

    *text* is the raw cited-references string ('; '-separated entries).
    Yields one dict per reference (and one per DOI when a reference lists
    several DOIs in ``[...]``) with keys ``doi`` (may be ``None``), ``au``
    (author), ``py`` (year) and ``so`` (source abbreviation).
    """
    ref_list = text.split('; ')
    for ref in ref_list:
        # The first three ', '-separated tokens are author, year and source.
        # NOTE(review): references with fewer than three tokens produce odd
        # slices here (find() == -1) — assumed not to occur in this data.
        ref_copy = ref
        var1 = []
        for _ in range(3):
            idx_t = ref_copy.find(INFO_SPLIT_SYMBOL)
            var1.append(ref_copy[:idx_t])
            ref_copy = ref_copy[idx_t + INFO_SPLIT_SYMBOL_LENGTH:]
        au, py, so = var1
        doi_idx = ref.find(DOI_SPLIT_SYMBOL)
        model = dict(au=au, py=py, so=so)
        if doi_idx != -1:
            doi_text = ref[doi_idx + DOI_SPLIT_SYMBOL_LENGTH:]
            if doi_text.startswith('['):
                # Multiple DOIs packed as '[doi1, doi2, ...]' — emit one
                # record per DOI, all sharing the same author/year/source.
                doi_mutil_text = doi_text[1:-1]
                doi_list = doi_mutil_text.split(', ')
                for doi_str in doi_list:
                    if doi := verify_doi(doi_str):
                        obj = dict(doi=doi)
                        obj.update(**model)
                        yield obj
            else:
                if doi := verify_doi(doi_text):
                    obj = dict(doi=doi)
                    obj.update(**model)
                    yield obj
        else:
            # 1.4: no DOI — still keep the parsed partial information.
            obj = dict(doi=None)
            obj.update(**model)
            yield obj


def step_1_1():
    """1.1 Parse the cited-references field of every school's publication files.

    Walks READ_PATH (one directory per school, skipping the two output
    directories), parses each tab-separated file's UT/CR columns and writes
    one '<school>.csv' of expanded references under ROOT_PATH/doi展开.
    """
    school_list = os.listdir(READ_PATH)
    for ff in school_list:
        # Skip the output directories that live next to the school folders.
        if ff == 'doi展开' or ff == '待下载DOI':
            continue
        clear = []
        school_path = os.path.join(READ_PATH, ff)
        file_list = os.listdir(school_path)
        for file in file_list:
            f = os.path.join(school_path, file)
            # on_bad_lines='skip' replaces the removed error_bad_lines=False
            # (dropped in pandas 2.0): malformed rows are silently skipped.
            table = pd.read_csv(f, sep='\t', on_bad_lines='skip')
            table = table[['UT', "CR"]]
            values = table.values.tolist()
            for value in values:
                third_id, article_reference_text = value
                if pd.isna(article_reference_text):
                    continue
                _g = ref_str2dic(article_reference_text)
                for r_doi in _g:
                    # Keep the citing publication's id for later matching (1.5).
                    r_doi.setdefault('third_id', third_id)
                    clear.append(r_doi)
        table2 = pd.DataFrame(clear)
        table2.to_csv(
            os.path.join(ROOT_PATH, 'doi展开',
                         '%s.csv' % os.path.basename(school_path)),
            index=False)


def step_1_2():
    """1.2 Download the full record of every reference DOI from MongoDB.

    Reads the pending-DOI list, queries Mongo in batches of 500 and writes
    the collected records to ROOT_PATH/CR字段扩展.csv (tab-separated).
    """
    BATCH_DOWNLOAD_LIMIT = 500
    doi_table = pd.read_csv(os.path.join(ROOT_PATH, '待下载DOI.csv'))
    extend_field_list = []
    tasks = doi_table['DO'].values.tolist()
    for i in range(0, len(tasks), BATCH_DOWNLOAD_LIMIT):
        batch_tasks = tasks[i: i + BATCH_DOWNLOAD_LIMIT]
        g = find_doi_data_from_mongo(doi_list=batch_tasks)
        for doc in g:
            extend_field_list.append(doc)
    extend_table = pd.DataFrame(extend_field_list)
    extend_table.to_csv(os.path.join(ROOT_PATH, 'CR字段扩展.csv'),
                        sep='\t', index=False)


def step_1_3():
    """Attach the downloaded full records to the parsed references.

    1.3 match full records onto references that have a DOI;
    1.4 references without a DOI keep only their parsed partial info.
    Writes ROOT_PATH/标准doi扩展完整字段.csv (tab-separated).
    """
    doi_expand = pd.read_csv(os.path.join(ROOT_PATH, 'doi展开.csv'))
    doi_full_record = pd.read_csv(os.path.join(ROOT_PATH, 'CR字段扩展.csv'), sep='\t')
    doi_full_record.rename(columns=TABLE_HEAD_TRANS, inplace=True)
    # Lower-case both DOI columns before the join so case differences
    # cannot prevent a match.
    doi_expand['doi_backup'] = doi_expand['doi'].str.lower()
    doi_full_record['DI'] = doi_full_record['DI'].str.lower()
    # Left-join the full-record fields onto the expanded references.
    # doi_expand[doi_expand['third_id']=='WOS:000426769900009']
    ref_table = doi_expand.merge(right=doi_full_record, how='left',
                                 left_on=['doi_backup'], right_on=['DI'])
    # Rename the headers to the agreed output names.
    ref_table.rename(
        columns={'au': '作者', 'py': '年份', 'so': '刊名-简称',
                 'SO': 'citedTitle', 'EI': 'online issn', 'BN': 'isbn'},
        inplace=True
    )
    # Placeholder columns required by the output schema.
    ref_table['citedWork'] = None
    ref_table['hot'] = None
    # Keep only the columns the downstream steps use.
    ref_table = ref_table[
        ['third_id', '作者', '年份', '刊名-简称', 'doi', 'citedTitle',
         'citedWork', 'hot', 'PT', 'UT', 'AB', 'SN', 'online issn', 'isbn']]
    ref_table.to_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'),
                     sep='\t', index=False)


def step_2_1(table: pd.DataFrame, ai_result_table):
    """2.1 Enrich one subject's publication table with school / author info.

    Left-joins *ai_result_table* (recognized schools and authors, keyed by
    '文献编号') onto the publications and returns the renamed, trimmed frame.
    """
    # table = pd.read_excel(os.path.join(ROOT_PATH, '原始数据合并\法学.xlsx'), sheet_name=0)
    # Keep only the raw columns we need.  NOTE(review): the IS→page and
    # VL→volume mapping below looks questionable — confirm with the data owner.
    table = table[['UT', 'TC', 'PY', 'IS', 'VL', 'CR']]
    # Match the recognized authors and schools.
    au_table = table.merge(right=ai_result_table, how='left',
                           left_on=['UT'], right_on=['文献编号'])
    # Placeholder id columns required by the output schema.
    au_table['uid'] = None
    au_table['docid'] = None
    au_table['articleId'] = None
    # Rename to the agreed base-table headers.
    au_table.rename(
        columns={'UT': 'wos', '本校的作者': 'citedAuthor', 'TC': 'timesCited',
                 'PY': 'year', 'IS': 'page', 'VL': 'volume', 'CR': '参考文献'},
        inplace=True
    )
    # Keep only the columns the final output needs.
    au_table = au_table[['wos', 'uid', 'docid', 'articleId', '所属学校',
                         'citedAuthor', 'timesCited', 'year', 'page',
                         'volume', '参考文献']]
    return au_table


def main_step1():
    # step_1_1()
    # step_1_2()
    step_1_3()
    # WOS:000426769900009


def main_step2():
    """Run stage 2: per-subject publication tables joined with citations."""
    ai_result_table = pd.read_csv(os.path.join(ROOT_PATH, '学校地址信息1.8.csv'))
    reference_table = pd.read_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'),
                                  sep='\t')
    # Collect the subject workbooks.
    subject_table_path = os.path.join(ROOT_PATH, '原始数据合并')
    # subject_table_list = os.listdir(subject_table_path)
    subject_table_list = ['法学.xlsx']
    for subject_table in subject_table_list:
        print('当前处理 %s' % subject_table)
        table = pd.read_excel(os.path.join(subject_table_path, subject_table),
                              sheet_name=0)
        table = step_2_1(table, ai_result_table)
        full_table = step_2_2(table, ref_table=reference_table)
        print(full_table)
        # third_id was only needed as the join key.
        full_table.drop(columns=['third_id'], inplace=True)
        full_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\%s' % subject_table),
                            index=False)


if __name__ == '__main__':
    # main_step1()
    main_step2()