# -*- coding: utf-8 -*-
# @Time : 2024/12/24 15:03
# @Author : zhaoxiangpeng
# @File : BCR_20241224.py

import os

import pandas as pd
from loguru import logger

from bcr.utils import str2float
import bcr.BCR_20240724 as bcr_20240724
import bcr.BCR_20240201 as bcr_20240201
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH


def task_change1(base_table: pd.DataFrame = None) -> pd.DataFrame:
    """Supplement the base table with the re-collected records for failed captures."""
    extend_table = pd.read_excel(os.path.join(ROOT_PATH, 'BCR2024书目补采API.xlsx'),
                                 engine='openpyxl', sheet_name=0)
    if isinstance(base_table, pd.DataFrame):
        # Keep only the EID column of the base table; it is used to
        # de-duplicate the supplementary table.
        dup_table = base_table[['EID']].copy()
        dup_table.drop_duplicates(subset=['EID'], inplace=True)
        # Rename the EID column so it does not clash in the merge.
        dup_table.rename(columns={'EID': 'dup_eid'}, inplace=True)
        # Left-join the supplementary table's EID against the base table's
        # dup_eid; rows whose dup_eid is null are the ones still missing.
        duped_table = extend_table.merge(right=dup_table, how='left',
                                         left_on=['EID'], right_on=['dup_eid'])
        duped_table = duped_table[duped_table['dup_eid'].isnull()]
        duped_table.drop(columns=['dup_eid'], inplace=True)  # drop the matching helper column
        all_data_table = pd.concat([base_table, duped_table])
        return all_data_table
    return extend_table


def step1_merge():
    path = 'Y:\\BCR\\2025BCR'
    path2 = os.path.join(path, 'MergeFile')
    files = os.listdir(path2)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path2, file)
        small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
        # small_table = small_table[['EID']]
        logger.info('merged file shape: {}', small_table.shape)
        big_table = pd.concat([big_table, small_table])
    small_table = pd.read_csv(r'Y:\BCR\BCR202412\补采1-20241127 13时37分下载(1).csv')
    big_table = pd.concat([big_table, small_table])
    return big_table


def step1_merge_change():
    """Merge the re-collected files, then top them up via task_change1."""
    path2 = os.path.join(ROOT_PATH, 'RESULT', '文件和并结果')
    files = os.listdir(path2)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path2, file)
        small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
        big_table = pd.concat([big_table, small_table])
    return task_change1(big_table)


def step2_change(table: pd.DataFrame, export: bool = True):
    # Normal aggregation.
    # 1. Sum the counts: unify the numeric types before summing.
    for col in ['2021', '2022', '2023', 'Grand Total']:
        table[col] = table[col].apply(str2float)
    # Merge records sharing an ISBN into a single record, summing the
    # per-year and Grand Total citation counts.
    agg_result = table.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum()
    agg_result.reset_index(inplace=True)  # reset the index
    # 2. Split into blocks: Book records keep only KEEP_COLUMNS.
    filter_table_is = table[table["文献类型"] == "Book"][KEEP_COLUMNS]
    filter_table_not = table[table["文献类型"] != "Book"]
    return agg_result, filter_table_is, filter_table_not


def main():
    STEP_IS_EXIST = True
    if STEP_IS_EXIST:
        table = step1_merge_change()
        # Only compute the step-2 table if its result file does not exist yet.
        step_2_table_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx")
        if not os.path.exists(step_2_table_path):
            step2_table = bcr_20240724.step2_change(table, reduce_columns=REDUCE_COLUMNS,
                                                    keep_columns=KEEP_COLUMNS, export=True)
        else:
            step2_table = pd.read_excel(step_2_table_path, sheet_name=0)
        # Same existence check for the step-3 result table.
        no_data_table_path = os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx')
        if not os.path.exists(no_data_table_path):
            step3_table, no_data_table = bcr_20240201.step3(step2_table, export=True)
        else:
            no_data_table = pd.read_excel(no_data_table_path, sheet_name=0)
        # Run step 4.
        bcr_20240201.step4(no_data_table)


if __name__ == '__main__':
    main()
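

# ---------------------------------------------------------------------------
# Illustrative sketch (hypothetical helper, not part of the original
# pipeline): shows how the groupby/sum in step2_change collapses duplicate
# ISBNs. The toy ISBNs and counts below are made up; only the column names
# match the real tables. Call _demo_isbn_aggregation() manually to inspect
# the merged output.
# ---------------------------------------------------------------------------
def _demo_isbn_aggregation():
    demo = pd.DataFrame({
        'ISBN': ['978-0-0000-0000-0', '978-0-0000-0000-0', '978-0-0000-0001-7'],
        '2021': ['1', '2', '0'],
        '2022': ['0', '1', '3'],
        '2023': ['4', '0', '1'],
        'Grand Total': ['5', '3', '4'],
    })
    # Same normalization as step2_change: coerce the count columns to float.
    for col in ['2021', '2022', '2023', 'Grand Total']:
        demo[col] = demo[col].apply(str2float)
    # Records sharing an ISBN merge into one row; the counts are summed,
    # so the first ISBN ends up with 2021=3.0, 2022=1.0, 2023=4.0, total=8.0.
    merged = demo.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum()
    print(merged.reset_index())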