You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

70 lines
3.1 KiB
Python

# -*- coding: utf-8 -*-
# @Time : 2023/1/17 14:06
# @Author : zhaoxiangpeng
# @File : bcr记录保留多列.py
import data_process_tool
import pandas as pd
def func1():
"""
包留DOI, 来源出版物,来源出版物缩写
"""
record1 = data_process_tool.read_data(root_path + 'eid去重.csv')
# 不为数字的列转为0
# 删除有问题的行
all_api_record = record1[['ISBN', '2020', '2021', '2022', 'GrandTotal']]
all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True)
all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0
all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float
group_by = all_api_record.groupby(by=['ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum()
group_by.reset_index(inplace=True)
# group_by.to_csv('.....csv') # 如果需要保存...
# 取需要保留的列并去重只保留一个
keep_columns = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN']]
keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True)
table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
print(table)
# table.to_csv(root_path+'统计ISBN使用量(保留来源出版物等字段).csv', index=False)
table.to_excel(root_path+'统计ISBN使用量(保留来源出版物等字段).xlsx', index=False)
def func2():
"""
将ISBN列分割为单个
"""
record1 = data_process_tool.read_data(root_path + 'eid去重.csv')
# 保留需要的列
all_api_record = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN', '2020', '2021', '2022', 'GrandTotal']]
ISBNs = all_api_record['ISBN'].str.split('; ', expand=True)
ISBNs = ISBNs.stack() # 把行转成列
ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引
ISBNs.name = 'ISBN'
all_api_record = all_api_record.drop(['ISBN'], axis=1).join(ISBNs)
# 也要处理一下有问题的哪行
all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True)
all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0
all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float
# 分组
group_by = all_api_record.groupby(by=['来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum()
group_by.reset_index(inplace=True)
keep_columns = all_api_record[['DOI', 'ISBN']]
keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True)
table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
print(table)
# table.to_csv(root_path + '统计ISBN使用量(ISBN分割).csv', index=False)
table.to_excel(root_path + '统计ISBN使用量(ISBN分割).xlsx', index=False)
if __name__ == '__main__':
root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
func1()
func2()