|
|
# -*- coding: utf-8 -*-
|
|
|
# @Time : 2022/12/14 14:46
|
|
|
# @Author : zhaoxiangpeng
|
|
|
# @File : api.py
|
|
|
|
|
|
import data_process_tool
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import re
|
|
|
|
|
|
|
|
|
def step1():
    """Deduplicate records by EID, keeping the row with the larger GrandTotal.

    Reads three CSV exports from the module-level ``root_path`` directory,
    back-fills the columns missing from the failure-retry export using the
    raw 2023 records, concatenates everything with the API download records,
    drops duplicate EIDs (keeping the max-GrandTotal row), and writes the
    per-ISBN yearly totals to an Excel file.

    Side effects: prints the retry records; writes
    ``root_path + 'eid去重grandTotal合并.xlsx'``.
    """
    record1 = data_process_tool.read_data(root_path + 'API下载记录.csv')
    record2 = data_process_tool.read_data(root_path + 'API失败记录重新下载采集.csv')
    record3 = data_process_tool.read_data(root_path + '2023原始记录.csv')

    # Columns present in the raw 2023 records but missing from the
    # failure-retry records; these get back-filled via the merge below.
    raw_api_chaji = {'EID', '归属机构', 'Author full names', '文献标题', 'CODEN', '访问类型', '带归属机构的作者',
                     '来源出版物名称缩写', '通讯地址', 'PubMed ID', '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'}
    print(record2)

    # Back-fill the retry records' missing columns from the raw records.
    # EID is renamed on the right side so the merge keeps both key columns.
    record3 = record3[list(raw_api_chaji)]
    record3.rename(columns={'EID': 'EID_copy'}, inplace=True)
    temp_ = pd.merge(record2, record3, how='left', left_on=['EID'], right_on=['EID_copy'])
    # Keep only the columns shared with the API download records so the
    # concat below lines up exactly.
    record2 = temp_[record1.columns.values.tolist()]
    # Combine API records with the (now completed) retry records.
    all_api_record = pd.concat([record1, record2])

    # Keep the max-GrandTotal row per EID: sort ascending by GrandTotal,
    # then drop duplicate EIDs keeping the last (largest) occurrence.
    all_api_record.sort_values(by=['GrandTotal'], inplace=True)
    all_api_record.drop_duplicates(subset=['EID'], keep='last', inplace=True)

    # Drop a known-corrupt row where an EID value leaked into the '2020'
    # column, then coerce the column to numeric.
    all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True)
    # FIX: assign instead of chained `inplace=True` fillna — fillna on a
    # column selection may operate on a copy and silently leave NaNs behind.
    all_api_record['2020'] = all_api_record['2020'].fillna(0)
    all_api_record['2020'] = all_api_record['2020'].astype(float)

    # Sum the yearly counts per ISBN.
    # NOTE(review): the dedup key above is EID but the grouping key here is
    # ISBN — looks intentional (book-level rollup) but worth confirming.
    # FIX: select the aggregated columns with a list — tuple-style column
    # selection after groupby was removed in pandas 1.0.
    group_by = all_api_record.groupby(by=['ISBN'])[['2020', '2021', '2022', 'GrandTotal']].sum()
    group_by.to_excel(root_path + 'eid去重grandTotal合并.xlsx')
|
|
|
|
|
|
|
|
|
def is_float(data):
    """Coerce *data* to float, returning 0 when conversion fails.

    Despite the ``is_`` prefix, this returns the converted value (or 0),
    not a boolean; the name is kept for backward compatibility.

    :param data: any value accepted by ``float()``.
    :return: ``float(data)``, or ``0`` if the value is not convertible.
    """
    try:
        return float(data)
    # FIX: catch only the exceptions float() actually raises for bad input,
    # instead of swallowing every Exception.
    except (TypeError, ValueError):
        # Echo the offending value (original behavior) and fall back to 0.
        print(data)
        return 0
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Module-level global consumed by step1(); it must be bound here, at
    # module scope, before the call — step1 reads it via global lookup.
    root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
    step1()
|