# -*- coding: utf-8 -*-
# @Time : 2022/12/14 14:46
# @Author : zhaoxiangpeng
# @File : api.py
import data_process_tool
import pandas as pd
import numpy as np
import re
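
# NOTE: data_process_tool is a local helper module that is not shown in this file.
# Based on how it is used below, read_data(path) is assumed to load a CSV (or Excel)
# file into a pandas DataFrame, roughly along the lines of:
#
#     def read_data(path: str) -> pd.DataFrame:
#         if path.endswith('.csv'):
#             return pd.read_csv(path)
#         return pd.read_excel(path)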


def step1():
    """
    Deduplicate by EID, keeping the row with the larger GrandTotal.
    """
    record1 = data_process_tool.read_data(root_path + 'API下载记录.csv')
    record2 = data_process_tool.read_data(root_path + 'API失败记录重新下载采集.csv')
    record3 = data_process_tool.read_data(root_path + '2023原始记录.csv')
    # Columns present in the API records but missing from the failed-download records
    api_chaji = {'归属机构', 'CODEN', '访问类型', '带归属机构的作者', '来源出版物名称缩写', '通讯地址', 'PubMed ID',
                 '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'}
    # Columns present in the raw records but missing from the failed-download records
    raw_api_chaji = {'EID', '归属机构', 'Author full names', '文献标题', 'CODEN', '访问类型', '带归属机构的作者',
                     '来源出版物名称缩写', '通讯地址', 'PubMed ID', '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'}
    print(record2)
    # Fill the columns missing from the failed-download records using the raw records
    record3 = record3[list(raw_api_chaji)]
    record3.rename(columns={'EID': 'EID_copy'}, inplace=True)
    temp_ = pd.merge(record2, record3, how='left', left_on=['EID'], right_on=['EID_copy'])
    # Keep only the columns shared with the API records
    record2 = temp_[record1.columns.values.tolist()]
    # Combine the API records with the re-downloaded failed records
    all_api_record = pd.concat([record1, record2])
    # To keep the maximum GrandTotal per EID: sort by GrandTotal, then drop duplicate EIDs keeping the last row
    all_api_record.sort_values(by=['GrandTotal'], inplace=True)
    all_api_record.drop_duplicates(subset=['EID'], keep='last', inplace=True)  # deduplicated records
    # Convert non-numeric values to 0
    # Drop the known problematic row first
    all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True)
    all_api_record['2020'] = all_api_record['2020'].fillna(0)  # replace missing values with 0
    all_api_record['2020'] = all_api_record['2020'].astype(float)  # convert the dtype to float
    # all_api_record['2020'] = all_api_record['2020'].apply(lambda x: x if re.search("^\d+$", str(x)) else np.nan)
    # Group by ISBN and sum the yearly counts and the grand total
    group_by = all_api_record.groupby(by=['ISBN'])[['2020', '2021', '2022', 'GrandTotal']].sum()
    group_by.to_excel(root_path + 'eid去重grandTotal合并.xlsx')
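

# Illustrative sketch, not called anywhere in this script: a minimal, self-contained
# demonstration of the dedup-by-maximum pattern used in step1 (sort by GrandTotal,
# then drop duplicate EIDs keeping the last row). The toy values are made up.
def _demo_dedup_keep_max_grandtotal():
    df = pd.DataFrame({'EID': ['a', 'a', 'b'], 'GrandTotal': [3, 5, 1]})
    df = df.sort_values(by=['GrandTotal'])
    df = df.drop_duplicates(subset=['EID'], keep='last')
    return df  # one row per EID: ('a', 5) and ('b', 1)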


def is_float(data):
    """Coerce a value to float; print the value and return 0 if it cannot be converted."""
    try:
        return float(data)
    except Exception:
        print(data)
        return 0
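

# Possible use of is_float (an assumption; step1 does not currently call it): it could
# replace the manual fillna/astype cleanup of a year column, e.g.
#
#     all_api_record['2020'] = all_api_record['2020'].apply(is_float)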


if __name__ == '__main__':
    root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
    step1()