You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
70 lines
3.1 KiB
Python
70 lines
3.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
# @Time : 2023/1/17 14:06
|
|
# @Author : zhaoxiangpeng
|
|
# @File : bcr记录保留多列.py
|
|
|
|
import data_process_tool
|
|
import pandas as pd
|
|
|
|
|
|
def func1():
|
|
"""
|
|
包留DOI, 来源出版物,来源出版物缩写
|
|
"""
|
|
record1 = data_process_tool.read_data(root_path + 'eid去重.csv')
|
|
# 不为数字的列转为0
|
|
# 删除有问题的行
|
|
all_api_record = record1[['ISBN', '2020', '2021', '2022', 'GrandTotal']]
|
|
all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True)
|
|
all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0
|
|
all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float
|
|
group_by = all_api_record.groupby(by=['ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum()
|
|
group_by.reset_index(inplace=True)
|
|
# group_by.to_csv('.....csv') # 如果需要保存...
|
|
# 取需要保留的列并去重只保留一个
|
|
keep_columns = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN']]
|
|
keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True)
|
|
|
|
table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
|
|
print(table)
|
|
# table.to_csv(root_path+'统计ISBN使用量(保留来源出版物等字段).csv', index=False)
|
|
table.to_excel(root_path+'统计ISBN使用量(保留来源出版物等字段).xlsx', index=False)
|
|
|
|
|
|
def func2():
|
|
"""
|
|
将ISBN列分割为单个
|
|
"""
|
|
record1 = data_process_tool.read_data(root_path + 'eid去重.csv')
|
|
# 保留需要的列
|
|
all_api_record = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN', '2020', '2021', '2022', 'GrandTotal']]
|
|
|
|
ISBNs = all_api_record['ISBN'].str.split('; ', expand=True)
|
|
ISBNs = ISBNs.stack() # 把行转成列
|
|
ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引
|
|
ISBNs.name = 'ISBN'
|
|
all_api_record = all_api_record.drop(['ISBN'], axis=1).join(ISBNs)
|
|
# 也要处理一下有问题的哪行
|
|
all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True)
|
|
all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0
|
|
all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float
|
|
|
|
# 分组
|
|
group_by = all_api_record.groupby(by=['来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum()
|
|
group_by.reset_index(inplace=True)
|
|
|
|
keep_columns = all_api_record[['DOI', 'ISBN']]
|
|
keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True)
|
|
|
|
table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
|
|
print(table)
|
|
# table.to_csv(root_path + '统计ISBN使用量(ISBN分割).csv', index=False)
|
|
table.to_excel(root_path + '统计ISBN使用量(ISBN分割).xlsx', index=False)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
|
|
func1()
|
|
func2()
|
|
|