# data_process/bcr/BCR_20240724.py

# -*- coding: utf-8 -*-
# @Time    : 2024/7/24 20:03
# @Author  : zhaoxiangpeng
# @File    : BCR_20240724.py

import os
import pandas as pd

from bcr.utils import read_file, str2float, str2int
from bcr.BCR_20240201 import step2, step3, step4
from bcr.BCR_20240201 import main, ROOT_PATH
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH

# Default output columns (in export order) used by step2_change when the
# caller does not pass keep_columns.
c2 = [
    '作者',            # authors
    '作者 ID',         # author IDs
    '标题',            # title
    '年份',            # year
    '来源出版物名称',  # source publication name
    '文献类型',        # document type
    'DOI',
    'ISBN',
    'EID',
    'Sort Year',
    '2021',
    '2022',
    '2023',
    '2024',
    'Grand Total',
]


def step2_change(table: pd.DataFrame, reduce_columns: list = None, keep_columns: list = None, export: bool = True):
    """
    Collapse the table to one row per ISBN and attach per-ISBN usage totals
    (revision of the second requirement from the PPT).

    For every ISBN the values in ``reduce_columns`` are summed over all rows
    sharing that ISBN. The remaining columns are taken from the first row whose
    文献类型 equals "Book"; when an ISBN has no such row, the first row of any
    other type is used instead.

    :param table: source rows. Must contain 'ISBN', '文献类型' and every column
        named in ``reduce_columns`` and ``keep_columns``. The caller's frame is
        not modified.
    :param reduce_columns: columns converted with ``str2float`` and summed per
        ISBN. Defaults to ['2021', '2022', '2023', 'Grand Total'].
    :param keep_columns: columns (and their order) of the returned table.
        Defaults to the module-level ``c2`` list.
    :param export: when True, also write the result to
        ``RESULT\\2.统计ISBN使用量(保留DOI).xlsx`` under ROOT_PATH.
    :return: DataFrame restricted to ``keep_columns``, one row per ISBN.
    """
    # 2024/12/25 14:58: reduce_columns became a parameter, replacing a fixed list.
    if reduce_columns is None:
        reduce_columns = ['2021', '2022', '2023', 'Grand Total']
    if keep_columns is None:
        keep_columns = c2

    # Normalise the numeric columns on a new frame so the caller's DataFrame
    # is left untouched.
    table = table.assign(**{col: table[col].apply(str2float) for col in reduce_columns})

    # 1. Per-ISBN sums of the numeric columns.
    agg_result = table.groupby(by=['ISBN'])[reduce_columns].sum().reset_index()

    # 2. Put the "Book" rows on top, then keep the first row per ISBN. Because
    #    keep='first' is used, a Book row wins whenever one exists; otherwise
    #    the first non-Book row survives. A single pass over the concatenation
    #    yields the same rows, in the same order, as de-duplicating each part
    #    first, and avoids in-place edits on filtered slices.
    is_book = table["文献类型"] == "Book"
    merge_table = pd.concat([table[is_book], table[~is_book]])
    merge_table = merge_table.drop_duplicates(subset=['ISBN'], keep='first')

    # 3. Drop the per-row counts; they are replaced by the per-ISBN sums below.
    merge_table = merge_table.drop(columns=reduce_columns)

    # 4. Re-attach the aggregated counts.
    result = pd.merge(merge_table, agg_result, how='left', on=['ISBN'])
    result_table = result[keep_columns]

    # NOTE(review): the previous version cast result['年份'] and
    # result['Sort Year'] to str *after* result_table had been selected, so the
    # cast never reached the returned/exported table (and raised KeyError when
    # those columns were absent). It is omitted here to keep the output
    # unchanged; if string years are wanted in the export, cast before
    # selecting keep_columns.

    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)

    return result_table


def main_change():
    """
    Run the revised step-2 pipeline end to end.

    Reads the merged export from ROOT_PATH/MergeFile, appends the supplementary
    2021 file, de-duplicates by EID, normalises the count columns with
    str2float and calls step2_change with export enabled.
    """
    merged = read_file(os.path.join(ROOT_PATH, 'MergeFile'))
    # For testing, a single file can be loaded instead:
    # merged = pd.read_csv(os.path.join(ROOT_PATH, 'MergeFile\\3.txt'), sep='\t')

    supplement_path = os.path.join(ROOT_PATH, '补充数据填充2021年total.txt')
    supplement = pd.read_csv(supplement_path, sep='\t')

    # The supplementary rows are appended last, so keep='last' lets them
    # override rows from the merged export that share the same EID.
    combined = pd.concat([merged, supplement])
    combined.drop_duplicates(subset=['EID'], keep='last', inplace=True)

    # Standardise the usage counts.
    for count_col in ('2021', '2022', '2023', 'Grand Total'):
        combined[count_col] = combined[count_col].apply(str2float)

    step2_change(combined, export=True)
    # Follow-up steps are currently disabled:
    # step3_table, no_data_table = step3(step2_table, export=True)
    # step4(no_data_table)


def change_field_type():
    """
    Re-read the exported step-2 workbook, pass the '年份' and 'Sort Year'
    columns through str2int, and save the result as a second workbook.
    """
    # NOTE(review): the input path is absolute while the output path is built
    # from ROOT_PATH - confirm both are meant to point at the same RESULT folder.
    source_path = 'Y:\\BCR\\202407\\RESULT\\2.统计ISBN使用量(保留DOI).xlsx'
    target_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI)2.xlsx")

    frame = pd.read_excel(source_path, sheet_name=0, engine='openpyxl')
    for year_col in ('年份', 'Sort Year'):
        frame[year_col] = frame[year_col].apply(str2int)
    frame.to_excel(target_path, index=False)


if __name__ == '__main__':
    # Script entry point: run the revised step-2 pipeline.
    main_change()
    # Optional post-processing of the exported workbook (currently disabled):
    # change_field_type()