# -*- coding: utf-8 -*-
# @Time : 2024/7/24 20:03
# @Author : zhaoxiangpeng
# @File : BCR_20240724.py

import os

import pandas as pd

from bcr.utils import read_file, str2float, str2int
from bcr.BCR_20240201 import step2, step3, step4
from bcr.BCR_20240201 import main, ROOT_PATH
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
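# Note: ROOT_PATH is imported twice; the later import from config is the one
# that takes effect. step3/step4 are only referenced by commented-out calls in
# main_change below, while step2, main, KEEP_COLUMNS and REDUCE_COLUMNS are
# currently unused in this script.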


# Output column order: author, author ID, title, year, source title,
# document type, DOI, ISBN, EID, sort year, and the per-year usage counts.
c2 = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '文献类型', 'DOI', 'ISBN', 'EID',
      'Sort Year', '2021', '2022', '2023', '2024', 'Grand Total']


def step2_change(table: pd.DataFrame, reduce_columns: list = None, keep_columns: list = None, export: bool = True):
    """
    Revised version of the second requirement from the PPT.
    """
    # Changed 2024/12/25 14:58: added the reduce_columns parameter so the
    # hard-coded column list can be overridden.
    if reduce_columns is None:
        reduce_columns = ['2021', '2022', '2023', 'Grand Total']
    if keep_columns is None:
        keep_columns = c2
    # Coerce the usage-count columns to numeric values.
    for col in reduce_columns:
        table[col] = table[col].apply(str2float)
    # Standard aggregation
    # 1. Sum the usage counts per ISBN.
    agg_result = table.groupby(by=['ISBN'])[reduce_columns].sum()
    agg_result.reset_index(inplace=True)  # restore ISBN as a regular column
    # 2. Split the table into Book and non-Book records (copies avoid
    #    SettingWithCopyWarning on the in-place operations below).
    filter_table_is = table[table["文献类型"] == "Book"].copy()
    filter_table_not = table[table["文献类型"] != "Book"].copy()
    # 3. Deduplicate each block by ISBN.
    filter_table_is.drop_duplicates(subset=['ISBN'], keep='first', inplace=True)
    filter_table_not.drop_duplicates(subset=['ISBN'], keep='first', inplace=True)
    # 4. Concatenate with the Book rows on top and deduplicate again, so a Book
    #    record wins whenever an ISBN appears in both blocks.
    merge_table = pd.concat([filter_table_is, filter_table_not])
    merge_table.drop_duplicates(subset=['ISBN'], keep='first', inplace=True)
    # 5. Drop the now-redundant per-row count columns.
    merge_table.drop(reduce_columns, axis=1, inplace=True)
    # Re-attach the aggregated counts.
    result = pd.merge(merge_table, agg_result, how='left', left_on=['ISBN'], right_on=['ISBN'])
    # Convert the year columns to text before selecting the output columns, so
    # the conversion is reflected in the exported table.
    result['年份'] = result['年份'].astype(str)
    result['Sort Year'] = result['Sort Year'].astype(str)
    result_table = result[keep_columns]
"""
|
|
|
# 新增的需求
|
|
|
# 以ISBN聚合,重复项保留
|
|
|
big_table = pd.DataFrame()
|
|
|
group_by = table.groupby(by=['ISBN'])
|
|
|
for _, group in group_by:
|
|
|
agg: pd.Series = group[reduce_columns].sum()
|
|
|
group_filter = group[group["文献类型"] == "Book"]
|
|
|
if group_filter.empty:
|
|
|
first = group[:1]
|
|
|
# total求和
|
|
|
else:
|
|
|
first = group_filter[:1]
|
|
|
# 替换聚合的值
|
|
|
first[reduce_columns] = agg
|
|
|
big_table = pd.concat([big_table, first])
|
|
|
|
|
|
group_by.reset_index(inplace=True)
|
|
|
"""
|
|
|
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)

    return result_table
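

# A minimal usage sketch (assumption: str2float from bcr.utils parses plain
# numeric strings). The frame below is made up: one Book record and one
# non-Book record sharing an ISBN, so step2_change should return a single row
# taken from the Book record with the 2021/2022/2023/Grand Total counts summed.
# _demo_step2_change is a hypothetical helper and is never called by this script.
def _demo_step2_change():
    demo = pd.DataFrame([
        {'作者': 'A', '作者 ID': '1', '标题': 'T1', '年份': 2021, '来源出版物名称': 'S',
         '文献类型': 'Book', 'DOI': '10.1/x', 'ISBN': '978-0-00-000000-0', 'EID': 'e1',
         'Sort Year': 2021, '2021': '1', '2022': '2', '2023': '3', '2024': '0', 'Grand Total': '6'},
        {'作者': 'B', '作者 ID': '2', '标题': 'T2', '年份': 2021, '来源出版物名称': 'S',
         '文献类型': 'Book Chapter', 'DOI': '10.1/y', 'ISBN': '978-0-00-000000-0', 'EID': 'e2',
         'Sort Year': 2021, '2021': '4', '2022': '0', '2023': '1', '2024': '0', 'Grand Total': '5'},
    ])
    return step2_change(demo, export=False)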


def main_change():
    table = read_file(os.path.join(ROOT_PATH, 'MergeFile'))
    # For testing against a single file:
    # table = pd.read_csv(os.path.join(ROOT_PATH, 'MergeFile\\3.txt'), sep='\t')

    # Append the supplementary data that fills in the 2021 totals; when an EID
    # appears in both sources the supplementary row wins (keep='last').
    t2 = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t')
    table = pd.concat([table, t2])
    table.drop_duplicates(subset=['EID'], keep='last', inplace=True)

    # Normalize the usage-count columns to numeric values.
    table['2021'] = table['2021'].apply(str2float)
    table['2022'] = table['2022'].apply(str2float)
    table['2023'] = table['2023'].apply(str2float)
    table['Grand Total'] = table['Grand Total'].apply(str2float)
    step2_table = step2_change(table, export=True)
    # step3_table, no_data_table = step3(step2_table, export=True)
    # step4(no_data_table)


def change_field_type():
    # Post-processing fix-up: convert the year columns of an already exported
    # result back to integers and write it out under a new file name.
    table = pd.read_excel('Y:\\BCR\\202407\\RESULT\\2.统计ISBN使用量(保留DOI).xlsx', sheet_name=0, engine='openpyxl')
    table['年份'] = table['年份'].apply(str2int)
    table['Sort Year'] = table['Sort Year'].apply(str2int)
    table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI)2.xlsx"), index=False)
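

# Run as a script, this executes main_change, which expects ROOT_PATH to point
# at a directory containing the MergeFile folder and the supplementary
# 补充数据填充2021年total.txt file.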
if __name__ == '__main__':
    main_change()
    # change_field_type()