# -*- coding: utf-8 -*- # @Time : 2024/7/24 20:03 # @Author : zhaoxiangpeng # @File : BCR_20240724.py import os import pandas as pd from bcr.utils import read_file, str2float, str2int from bcr.BCR_20240201 import step2, step3, step4 from bcr.BCR_20240201 import main, ROOT_PATH from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH c2 = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '文献类型', 'DOI', 'ISBN', 'EID', 'Sort Year', '2021', '2022', '2023', '2024', 'Grand Total'] def step2_change(table: pd.DataFrame, reduce_columns: list = None, keep_columns: list = None, export: bool = True): """ ppt第二个需求修改 """ # 2024/12/25 14:58 修改,增加了reduce_columns参数用来替换固定值 if reduce_columns is None: reduce_columns = ['2021', '2022', '2023', 'Grand Total'] if keep_columns is None: keep_columns = c2 # 处理数值类型 for col in reduce_columns: table[col] = table[col].apply(str2float) # 正常聚合 # 1.求和结果 agg_result = table.groupby(by=['ISBN'])[reduce_columns].sum() agg_result.reset_index(inplace=True) # 重置索引 # 2.分块 filter_table_is = table[table["文献类型"] == "Book"] filter_table_not = table[table["文献类型"] != "Book"] # 3.分别去重 filter_table_is.drop_duplicates(subset=['ISBN'], keep='first', inplace=True) filter_table_not.drop_duplicates(subset=['ISBN'], keep='first', inplace=True) # 4.合并去重保留是Book的,book的在上面,重复项保留上面的 merge_table = pd.concat([filter_table_is, filter_table_not]) merge_table.drop_duplicates(subset=['ISBN'], keep='first', inplace=True) # 5.删除多于列 merge_table.drop(reduce_columns, axis=1, inplace=True) # 重新匹配 result = pd.merge(merge_table, agg_result, how='left', left_on=['ISBN'], right_on=['ISBN']) result_table = result[keep_columns] result['年份'] = result['年份'].astype(str) result['Sort Year'] = result['Sort Year'].astype(str) """ # 新增的需求 # 以ISBN聚合,重复项保留 big_table = pd.DataFrame() group_by = table.groupby(by=['ISBN']) for _, group in group_by: agg: pd.Series = group[reduce_columns].sum() group_filter = group[group["文献类型"] == "Book"] if group_filter.empty: first = group[:1] # total求和 else: first = group_filter[:1] # 替换聚合的值 first[reduce_columns] = agg big_table = pd.concat([big_table, first]) group_by.reset_index(inplace=True) """ if export: result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False) return result_table def main_change(): table = read_file(os.path.join(ROOT_PATH, 'MergeFile')) # 测试 # table = pd.read_csv(os.path.join(ROOT_PATH, 'MergeFile\\3.txt'), sep='\t') t2 = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t') table = pd.concat([table, t2]) table.drop_duplicates(subset=['EID'], keep='last', inplace=True) # 把数量统计标准化 table['2021'] = table['2021'].apply(str2float) table['2022'] = table['2022'].apply(str2float) table['2023'] = table['2023'].apply(str2float) table['Grand Total'] = table['Grand Total'].apply(str2float) step2_table = step2_change(table, export=True) # step3_table, no_data_table = step3(step2_table, export=True) # step4(no_data_table) def change_field_type(): table = pd.read_excel('Y:\\BCR\\202407\\RESULT\\2.统计ISBN使用量(保留DOI).xlsx', sheet_name=0, engine='openpyxl') table['年份'] = table['年份'].apply(str2int) table['Sort Year'] = table['Sort Year'].apply(str2int) table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI)2.xlsx"), index=False) if __name__ == '__main__': main_change() # change_field_type()