# -*- coding: utf-8 -*- # @Time : 2023/1/17 14:06 # @Author : zhaoxiangpeng # @File : bcr记录保留多列.py import data_process_tool import pandas as pd def func1(): """ 包留DOI, 来源出版物,来源出版物缩写 """ record1 = data_process_tool.read_data(root_path + 'eid去重.csv') # 不为数字的列转为0 # 删除有问题的行 all_api_record = record1[['ISBN', '2020', '2021', '2022', 'GrandTotal']] all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True) all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0 all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float group_by = all_api_record.groupby(by=['ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum() group_by.reset_index(inplace=True) # group_by.to_csv('.....csv') # 如果需要保存... # 取需要保留的列并去重只保留一个 keep_columns = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN']] keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True) table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN']) print(table) # table.to_csv(root_path+'统计ISBN使用量(保留来源出版物等字段).csv', index=False) table.to_excel(root_path+'统计ISBN使用量(保留来源出版物等字段).xlsx', index=False) def func2(): """ 将ISBN列分割为单个 """ record1 = data_process_tool.read_data(root_path + 'eid去重.csv') # 保留需要的列 all_api_record = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN', '2020', '2021', '2022', 'GrandTotal']] ISBNs = all_api_record['ISBN'].str.split('; ', expand=True) ISBNs = ISBNs.stack() # 把行转成列 ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引 ISBNs.name = 'ISBN' all_api_record = all_api_record.drop(['ISBN'], axis=1).join(ISBNs) # 也要处理一下有问题的哪行 all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True) all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0 all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float # 分组 group_by = all_api_record.groupby(by=['来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum() group_by.reset_index(inplace=True) keep_columns = all_api_record[['DOI', 'ISBN']] keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True) table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN']) print(table) # table.to_csv(root_path + '统计ISBN使用量(ISBN分割).csv', index=False) table.to_excel(root_path + '统计ISBN使用量(ISBN分割).xlsx', index=False) if __name__ == '__main__': root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/' func1() func2()