# data_process/bcr/bcr记录保留多列.py

# -*- coding: utf-8 -*-
# @Time    : 2023/1/17 14:06
# @Author  : zhaoxiangpeng
# @File    : bcr记录保留多列.py

import data_process_tool
import pandas as pd


def func1():
    """
    Total the usage counts per ISBN while keeping the descriptive columns.

    Reads ``eid去重.csv`` from the module-level ``root_path`` (assigned in the
    ``__main__`` guard, so it must exist before this function is called),
    sums the ``2020``/``2021``/``2022``/``GrandTotal`` columns for each ISBN,
    and attaches the DOI, source-publication name, publisher and
    source-publication abbreviation of the first record seen for that ISBN.
    The merged table is printed and written to an Excel file under
    ``root_path``.
    """
    count_columns = ['2020', '2021', '2022', 'GrandTotal']
    record1 = data_process_tool.read_data(root_path + 'eid去重.csv')

    # Skip the malformed row whose '2020' cell holds an EID instead of a count.
    # Filtering with a mask and taking an explicit copy avoids mutating a
    # slice of ``record1`` (chained assignment), which pandas does not
    # guarantee to take effect.
    is_malformed = record1['2020'] == '2-s2.0-84971016798'
    all_api_record = record1.loc[~is_malformed, ['ISBN'] + count_columns].copy()

    # Empty cells count as 0; the column is then forced to float so it sums.
    # NOTE(review): only '2020' is normalised here; the other count columns
    # are summed as read -- confirm they are already numeric.
    all_api_record['2020'] = all_api_record['2020'].fillna(0).astype(float)

    # Columns must be selected with a list: tuple-style selection on a
    # groupby object is rejected by pandas 2.0 and later.
    group_by = all_api_record.groupby(by=['ISBN'])[count_columns].sum().reset_index()
    # group_by.to_csv('.....csv')  # enable if the intermediate totals are needed

    # One descriptive row per ISBN, taken from the first occurrence.
    keep_columns = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN']]
    keep_columns = keep_columns.drop_duplicates(subset=['ISBN'], keep='first')

    # Right join so that every summed ISBN appears exactly as grouped.
    table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    print(table)
    # table.to_csv(root_path+'统计ISBN使用量(保留来源出版物等字段).csv', index=False)
    table.to_excel(root_path+'统计ISBN使用量(保留来源出版物等字段).xlsx', index=False)


def func2():
    """
    Split multi-valued ISBN cells into single ISBNs and total usage for each.

    Reads ``eid去重.csv`` from the module-level ``root_path`` (assigned in the
    ``__main__`` guard, so it must exist before this function is called).
    An ISBN cell may hold several values separated by ``'; '``; each value
    gets its own row carrying the counts of the original record. The
    ``2020``/``2021``/``2022``/``GrandTotal`` columns are then summed per
    (source-publication name, publisher, abbreviation, ISBN) and the first DOI
    seen for each ISBN is attached. The merged table is printed and written
    to an Excel file under ``root_path``.
    """
    name_columns = ['来源出版物名称', '出版商', '来源出版物名称缩写']
    count_columns = ['2020', '2021', '2022', 'GrandTotal']
    record1 = data_process_tool.read_data(root_path + 'eid去重.csv')

    # Skip the malformed row whose '2020' cell holds an EID instead of a count.
    # Filtering with a mask and taking an explicit copy avoids mutating a
    # slice of ``record1`` (chained assignment), which pandas does not
    # guarantee to take effect.
    is_malformed = record1['2020'] == '2-s2.0-84971016798'
    wanted = ['DOI'] + name_columns + ['ISBN'] + count_columns
    all_api_record = record1.loc[~is_malformed, wanted].copy()

    # Empty cells count as 0; the column is then forced to float so it sums.
    # Done before the split so the assignment happens on a unique index.
    all_api_record['2020'] = all_api_record['2020'].fillna(0).astype(float)

    # One row per individual ISBN; rows without an ISBN stay as a single
    # row with a missing ISBN.
    all_api_record['ISBN'] = all_api_record['ISBN'].str.split('; ')
    all_api_record = all_api_record.explode('ISBN')

    # Columns must be selected with a list: tuple-style selection on a
    # groupby object is rejected by pandas 2.0 and later.
    group_by = all_api_record.groupby(by=name_columns + ['ISBN'])[count_columns].sum()
    group_by = group_by.reset_index()

    # First DOI encountered for each ISBN.
    keep_columns = all_api_record[['DOI', 'ISBN']]
    keep_columns = keep_columns.drop_duplicates(subset=['ISBN'], keep='first')

    # Right join so that every grouped row is kept.
    table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    print(table)
    # table.to_csv(root_path + '统计ISBN使用量(ISBN分割).csv', index=False)
    table.to_excel(root_path + '统计ISBN使用量(ISBN分割).xlsx', index=False)


if __name__ == '__main__':
    # Directory that both jobs read their input from and write their output
    # to; they pick it up as a module-level name.
    root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
    # Run the per-ISBN summary first, then the split-ISBN summary.
    for job in (func1, func2):
        job()