You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

104 lines
3.9 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2024/7/24 20:03
# @Author : zhaoxiangpeng
# @File : BCR_20240724.py
import os
import pandas as pd
from bcr.utils import read_file, str2float, str2int
from bcr.BCR_20240201 import step2, step3, step4
from bcr.BCR_20240201 import main, ROOT_PATH
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
# Default output columns (and their order) used by step2_change when the
# caller does not pass keep_columns.
c2 = [
    '作者',
    '作者 ID',
    '标题',
    '年份',
    '来源出版物名称',
    '文献类型',
    'DOI',
    'ISBN',
    'EID',
    'Sort Year',
    '2021',
    '2022',
    '2023',
    '2024',
    'Grand Total',
]
def step2_change(table: pd.DataFrame, reduce_columns: list = None, keep_columns: list = None, export: bool = True):
    """Collapse *table* to one row per ISBN with summed usage counts.

    The values of ``reduce_columns`` are summed over all rows sharing an
    ISBN. The remaining columns of each surviving row are taken from the
    first row whose ``文献类型`` equals ``"Book"``; when an ISBN has no such
    row, its first row of any other type is used instead.

    Args:
        table: Source records. Must contain ``ISBN``, ``文献类型``, every
            column in ``reduce_columns`` and every column in
            ``keep_columns``. The caller's frame is left untouched.
        reduce_columns: Columns normalised with ``str2float`` and summed per
            ISBN. Defaults to ``['2021', '2022', '2023', 'Grand Total']``.
        keep_columns: Columns (and their order) of the returned table.
            Defaults to the module-level ``c2`` list.
        export: When true, the result is additionally written (without the
            index) to ``2.统计ISBN使用量(保留DOI).xlsx`` in the ``RESULT``
            folder under ``ROOT_PATH``.

    Returns:
        A new DataFrame restricted to ``keep_columns``.
    """
    # 2024/12/25 14:58: reduce_columns became a parameter instead of a
    # fixed list.
    if reduce_columns is None:
        reduce_columns = ['2021', '2022', '2023', 'Grand Total']
    if keep_columns is None:
        keep_columns = c2

    # Work on a copy so the numeric normalisation below does not silently
    # rewrite columns of the DataFrame owned by the caller.
    table = table.copy()
    for col in reduce_columns:
        table[col] = table[col].apply(str2float)

    # 1. Per-ISBN totals of the usage columns.
    # NOTE(review): groupby drops rows with a missing ISBN by default, while
    # drop_duplicates below keeps one such row; that row ends up with NaN
    # totals after the left merge. Confirm this is the intended handling.
    totals = table.groupby(by=['ISBN'])[reduce_columns].sum().reset_index()

    # 2. Choose one representative row per ISBN. "Book" rows are stacked on
    #    top so that keep='first' prefers them over every other type.
    #    Non-inplace calls are used throughout: calling an inplace method on
    #    a boolean-mask slice raises SettingWithCopyWarning on older pandas.
    is_book = table["文献类型"] == "Book"
    ordered = pd.concat([table[is_book], table[~is_book]])
    representatives = ordered.drop_duplicates(subset=['ISBN'], keep='first')

    # 3. Discard the per-row usage values; the totals replace them.
    representatives = representatives.drop(columns=reduce_columns)

    # 4. Attach the totals and select the output columns.
    result = pd.merge(representatives, totals, how='left', on=['ISBN'])
    result_table = result[keep_columns]

    # NOTE(review): earlier revisions cast 年份 / Sort Year to str on
    # `result` *after* `result_table` had already been sliced out, so the
    # cast never reached the returned or exported data. It is omitted here
    # to keep the output unchanged; if string-typed years are wanted, cast
    # before selecting keep_columns.

    if export:
        # Path components are joined separately instead of embedding a
        # Windows separator in the literal; the resulting path on Windows
        # is unchanged.
        result_table.to_excel(
            os.path.join(ROOT_PATH, "RESULT", "2.统计ISBN使用量(保留DOI).xlsx"),
            index=False,
        )
    return result_table
def main_change():
    """Build the combined input table and run ``step2_change`` on it.

    The records loaded by ``read_file`` from the ``MergeFile`` location
    under ``ROOT_PATH`` are concatenated with the tab-separated supplement
    file, de-duplicated on ``EID``, have their count columns normalised with
    ``str2float`` and are then passed to ``step2_change`` with export
    enabled. Nothing is returned.
    """
    merge_location = os.path.join(ROOT_PATH, 'MergeFile')
    supplement_location = os.path.join(ROOT_PATH, '补充数据填充2021年total.txt')

    records = read_file(merge_location)
    supplement = pd.read_csv(supplement_location, sep='\t')

    # The supplement is appended last, so keep='last' lets its rows win
    # whenever an EID occurs in both inputs.
    combined = pd.concat([records, supplement])
    combined.drop_duplicates(subset=['EID'], keep='last', inplace=True)

    # Normalise the usage counts before aggregation.
    for count_column in ('2021', '2022', '2023', 'Grand Total'):
        combined[count_column] = combined[count_column].apply(str2float)

    # The follow-up stages (step3 / step4), which would consume the table
    # returned here, are currently disabled.
    step2_change(combined, export=True)
def change_field_type(source_path: str = None, target_path: str = None):
    """Re-type the year columns of an exported step-2 workbook.

    Reads the first sheet of *source_path* with the ``openpyxl`` engine,
    passes the ``年份`` and ``Sort Year`` columns through ``str2int`` and
    writes the table to *target_path* without the index.

    Args:
        source_path: Workbook to read. Defaults to the previously
            hard-coded export on the ``Y:`` drive, so existing calls
            without arguments behave as before.
        target_path: Workbook to write. Defaults to
            ``2.统计ISBN使用量(保留DOI)2.xlsx`` in the ``RESULT`` folder
            under ``ROOT_PATH``.
    """
    if source_path is None:
        # NOTE(review): this absolute path is independent of ROOT_PATH,
        # unlike the output below; confirm whether it should follow
        # ROOT_PATH as well.
        source_path = 'Y:\\BCR\\202407\\RESULT\\2.统计ISBN使用量(保留DOI).xlsx'
    if target_path is None:
        target_path = os.path.join(ROOT_PATH, "RESULT", "2.统计ISBN使用量(保留DOI)2.xlsx")

    table = pd.read_excel(source_path, sheet_name=0, engine='openpyxl')
    for year_column in ('年份', 'Sort Year'):
        table[year_column] = table[year_column].apply(str2int)
    table.to_excel(target_path, index=False)
# Script entry point: build the combined table and export the step-2 result.
# To only re-type the year columns of an existing export, call
# change_field_type() here instead.
if __name__ == "__main__":
    main_change()