# -*- coding: utf-8 -*- # @Time : 2022/12/14 14:46 # @Author : zhaoxiangpeng # @File : api.py import data_process_tool import pandas as pd import numpy as np import re def step1(): """ EID去重保留gratetotal更大值的行 """ record1 = data_process_tool.read_data(root_path + 'API下载记录.csv') record2 = data_process_tool.read_data(root_path + 'API失败记录重新下载采集.csv') record3 = data_process_tool.read_data(root_path + '2023原始记录.csv') # 失败记录 对比 api记录中缺失的列 api_chaji = {'归属机构', 'CODEN', '访问类型', '带归属机构的作者', '来源出版物名称缩写', '通讯地址', 'PubMed ID', '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'} # 失败记录 对比 原始记录中缺失的列 raw_api_chaji = {'EID', '归属机构', 'Author full names', '文献标题', 'CODEN', '访问类型', '带归属机构的作者', '来源出版物名称缩写', '通讯地址', 'PubMed ID', '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'} print(record2) # 把 失败记录缺失的列在原始记录中补充 record3 = record3[list(raw_api_chaji)] record3.rename(columns={'EID': 'EID_copy'}, inplace=True) temp_ = pd.merge(record2, record3, how='left', left_on=['EID'], right_on=['EID_copy']) # 只保留与api记录相同的列 record2 = temp_[record1.columns.values.tolist()] # api记录与失败记录合并 all_api_record = pd.concat([record1, record2]) # 保留GrandTotal最大值 的EID,换个思路,把 GrandTotal 列排序,对EID去重保留最后一个 all_api_record.sort_values(by=['GrandTotal'], inplace=True) all_api_record.drop_duplicates(subset=['EID'], keep='last', inplace=True) # 去重后的文件 # 不为数字的列转为0 # 删除有问题的行 all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True) all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0 all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float # all_api_record['2020'] = all_api_record['2020'].apply(lambda x: x if re.search("^\d+$", str(x)) else np.nan) # 对ISBN group_by = all_api_record.groupby(by=['ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum() group_by.to_excel(root_path + 'eid去重grandTotal合并.xlsx') def is_float(data): try: return float(data) except Exception: print(data) return 0 if __name__ == '__main__': root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/' step1()