# data_process/bcr/BCR_20241224.py

# -*- coding: utf-8 -*-
# @Time    : 2024/12/24 15:03
# @Author  : zhaoxiangpeng
# @File    : BCR_20241224.py

import os
import re
import warnings
import chardet
import pandas as pd
from loguru import logger
from bcr.utils import read_file, str2float, export_small_file
import bcr.BCR_20240724 as bcr_20240724
import bcr.BCR_20240201 as bcr_20240201
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH


def task_change1(base_table: pd.DataFrame = None, extend_table: pd.DataFrame = None) -> pd.DataFrame:
    """
    Append re-collected records that are missing from the main table.

    Rows of the supplementary table whose ``EID`` does not occur in
    ``base_table`` are appended to ``base_table``.

    :param base_table: main table; must contain an ``EID`` column. When it is
        not a DataFrame (e.g. ``None``) the supplementary table is returned
        unchanged.
    :param extend_table: supplementary table with an ``EID`` column. When
        omitted it is read from the first sheet of ``BCR2024书目补采API.xlsx``
        under ``ROOT_PATH``.
    :return: ``base_table`` followed by the supplementary rows it lacked
        (original index labels are kept), or the supplementary table itself
        when no usable ``base_table`` was given.
    """
    if extend_table is None:
        extend_table = pd.read_excel(os.path.join(ROOT_PATH, 'BCR2024书目补采API.xlsx'), engine='openpyxl', sheet_name=0)
    if not isinstance(base_table, pd.DataFrame):
        return extend_table

    # Keep only the distinct EIDs of the main table, under a different column
    # name so the join cannot clash with the supplementary table's own EID.
    # Non-inplace calls are used on purpose: mutating a frame derived from
    # another one in place is what triggers SettingWithCopyWarning.
    known_eids = (
        base_table[['EID']]
        .drop_duplicates(subset=['EID'])
        .rename(columns={'EID': 'dup_eid'})
    )
    # Left join: supplementary rows without a partner in the main table end up
    # with a null dup_eid, and those are exactly the rows to add.
    merged = extend_table.merge(right=known_eids, how='left', left_on=['EID'], right_on=['dup_eid'])
    # Drop the helper column that was only needed for matching.
    missing_rows = merged[merged['dup_eid'].isnull()].drop(columns=['dup_eid'])
    return pd.concat([base_table, missing_rows])


def step1_merge(merge_dir: str = None, extra_csv: str = None) -> pd.DataFrame:
    """
    Merge every workbook in the merge directory plus one supplementary CSV.

    :param merge_dir: directory whose entries are each read as an Excel
        workbook (first sheet, openpyxl engine). Defaults to the ``MergeFile``
        folder under ``Y:\\BCR\\2025BCR``.
    :param extra_csv: CSV file appended after the workbooks. Defaults to the
        supplementary download under ``Y:\\BCR\\BCR202412``.
    :return: all tables concatenated row-wise, workbooks first, CSV last.
    """
    if merge_dir is None:
        merge_dir = os.path.join('Y:\\BCR\\2025BCR', 'MergeFile')
    if extra_csv is None:
        extra_csv = r'Y:\BCR\BCR202412\补采1-20241127 13时37分下载(1).csv'

    # Collect the pieces and concatenate once: calling pd.concat inside the
    # loop re-copies every accumulated row on each iteration, and seeding the
    # result with an empty DataFrame is a deprecated concat pattern.
    tables = []
    for file in os.listdir(merge_dir):
        small_table = pd.read_excel(os.path.join(merge_dir, file), engine='openpyxl', sheet_name=0)
        print(small_table.shape)
        tables.append(small_table)
    tables.append(pd.read_csv(extra_csv))
    return pd.concat(tables)


def step1_merge_change() -> pd.DataFrame:
    """
    Merge the already-combined result workbooks and add the re-collected rows.

    Every entry of the ``RESULT\\文件和并结果`` folder under ``ROOT_PATH`` is read
    as an Excel workbook (first sheet); the tables are concatenated row-wise
    and handed to ``task_change1``.

    :return: whatever ``task_change1`` returns for the merged table.
    """
    # Raw string: '\文' is not a valid escape sequence, so the non-raw literal
    # triggers an invalid-escape warning on modern Python. The runtime value
    # is unchanged.
    path2 = os.path.join(ROOT_PATH, r'RESULT\文件和并结果')
    tables = [
        pd.read_excel(os.path.join(path2, file), engine='openpyxl', sheet_name=0)
        for file in os.listdir(path2)
    ]
    # Concatenate once instead of growing the frame inside the loop; an empty
    # folder still yields an empty DataFrame, as before.
    big_table = pd.concat(tables) if tables else pd.DataFrame()
    return task_change1(big_table)


def step2_change(table: pd.DataFrame, export: bool = True):
    """
    Normalise the citation-count columns of ``table`` and prepare the step-2 split.

    The columns ``2021``, ``2022``, ``2023`` and ``Grand Total`` are converted
    in place with ``str2float``. The function then computes per-ISBN sums and
    splits the rows on whether ``文献类型`` equals ``"Book"``.

    NOTE(review): this looks unfinished - the intermediate results are never
    returned or exported, ``export`` is not read, and the function returns
    ``None``. ``main`` calls ``bcr_20240724.step2_change`` instead; confirm
    whether this local version is still needed.

    :param table: merged table; modified in place.
    :param export: currently unused.
    """
    count_columns = ['2021', '2022', '2023', 'Grand Total']
    # The numeric types must be unified before summing.
    for column in count_columns:
        table[column] = table[column].apply(str2float)
    # Collapse records sharing an ISBN into one row, summing the yearly and
    # Grand Total citation counts; reset_index turns ISBN back into a column.
    totals_by_isbn = table.groupby(by=['ISBN'])[count_columns].sum().reset_index()
    # Partition the rows into books and everything else.
    book_rows = table[table["文献类型"] == "Book"]
    other_rows = table[table["文献类型"] != "Book"]
    # Bare expression kept from the original: the selection is discarded.
    book_rows[KEEP_COLUMNS]


def main():
    """
    Run the BCR pipeline, reusing step results that already exist on disk.

    The merged table is loaded first. Steps 2 and 3 are each computed only
    when their result workbook is missing under ``ROOT_PATH``; otherwise the
    workbook is read back. Step 4 always runs on the unmatched-records table.
    """
    STEP_IS_EXIST = True
    if STEP_IS_EXIST:
        table = step1_merge_change()

    # Step 2: load the saved result if present, otherwise compute and export it.
    step_2_table_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx")
    if os.path.exists(step_2_table_path):
        step2_table = pd.read_excel(step_2_table_path, sheet_name=0)
    else:
        step2_table = bcr_20240724.step2_change(
            table,
            reduce_columns=REDUCE_COLUMNS,
            keep_columns=KEEP_COLUMNS,
            export=True,
        )

    # Step 3: same pattern, keyed on the unmatched-records workbook.
    no_data_table_path = os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx')
    if os.path.exists(no_data_table_path):
        no_data_table = pd.read_excel(no_data_table_path, sheet_name=0)
    else:
        # Only the second element of step3's result is used here.
        _matched_table, no_data_table = bcr_20240201.step3(step2_table, export=True)

    # Step 4 runs on the unmatched records.
    bcr_20240201.step4(no_data_table)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()