You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

205 lines
8.0 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2024/2/2 10:45
# @Author : zhaoxiangpeng
# @File : BCR_20240201.py
import os
from copy import deepcopy
import pandas as pd
import data_process_tool
from bcr.utils import read_file, str2float, str2int
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
# ROOT_PATH = "Y:\\zhaoxiangpeng\\2024BCR"
# ROOT_PATH = "Y:\\zhaoxiangpeng\\BCR202403"
# ROOT_PATH = "Y:\\BCR\\202407"
C_COLUMNS = ['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total', 'ISBN']
def main():
    """Entry point: merge the raw usage exports, normalize the count columns,
    then run pipeline steps 2-4 (aggregate, match last year's BCR, fallback match)."""
    merged = read_file(os.path.join(ROOT_PATH, 'MergeFile'))
    extra = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t')
    merged = pd.concat([merged, extra])
    # keep='last' so the supplementary rows override the originals on EID clash
    merged.drop_duplicates(subset=['EID'], keep='last', inplace=True)
    # Normalize the usage-count columns to numeric form.
    for count_col in ('2021', '2022', '2023', 'Grand Total'):
        merged[count_col] = merged[count_col].apply(str2float)
    step2_result = step2(merged, export=True)
    _, unmatched = step3(step2_result, export=True)
    step4(unmatched)
def process1(table: pd.DataFrame):
    """Split *table* into a DOI-keyed frame and an ISBN-keyed frame for matching.

    Args:
        table: frame containing at least 'DOI' and 'ISBN' columns.

    Returns:
        (doi_frame, isbn_frame): rows that have a DOI, deduplicated on 'DOI';
        and rows lacking a DOI, deduplicated on 'ISBN' (matched by ISBN later).
    """
    TABLE2 = deepcopy(table)
    # Rows with a DOI: keep one row per DOI for the DOI-based match.
    # Chained non-inplace calls replace the previous inplace-on-slice pattern,
    # which raised SettingWithCopyWarning and is unreliable under pandas
    # copy-on-write semantics.
    DOI_PROED = TABLE2.dropna(subset=['DOI']).drop_duplicates(subset=['DOI'])
    # Rows without a DOI fall back to ISBN matching; one row per ISBN.
    ISBN_PROED = TABLE2[TABLE2['DOI'].isnull()].drop_duplicates(subset=['ISBN'])
    return DOI_PROED, ISBN_PROED
def process_func2(table: pd.DataFrame):
    """Explode the '; '-separated 'ISBN' column into one row per ISBN.

    The original multi-ISBN string is preserved under the new column
    'ISBN RAW'; the exploded single ISBNs take over the 'ISBN' name.
    """
    frame = deepcopy(table)
    # Cast to str first — non-string cells would otherwise fail to split.
    frame['ISBN'] = frame['ISBN'].astype(str)
    single_isbns = (
        frame['ISBN']
        .str.split('; ', expand=True)
        .stack()                           # wide -> long: one ISBN per row
        .reset_index(level=1, drop=True)   # drop the inner position level
    )
    single_isbns.name = 'ISBN'
    # Rename the raw column, then index-align join: each source row is
    # repeated once per individual ISBN it contained.
    exploded: pd.DataFrame = frame.rename(columns={'ISBN': 'ISBN RAW'}).join(single_isbns)
    return exploded
def process_func3(export: bool = True):
    """Merge the two Scopus source-book workbooks into one deduplicated table.

    If the merged export does not exist yet, it is built from the two raw
    workbooks and (when *export* is True) written to disk; otherwise the
    previously exported file is loaded and returned as-is.

    NOTE(review): the raw workbooks are read from the hard-coded
    'Y:\\BCR\\202407' directory rather than ROOT_PATH — confirm this is
    intentional when ROOT_PATH points elsewhere.
    """
    # Columns carried over from both workbooks.
    keep_columns = [
        'Title', 'Scopus ID',
        'Print ISBN', 'E-ISBN', 'Other ISBN',
        'Publication year', 'Publisher imprint', 'Publisher imprints grouped to main Publisher',
        'Classification 1', 'Classification 2', 'Classification 3', 'Classification 4',
    ]
    export_file_path = os.path.join(ROOT_PATH, "After\\4.两表字段合并.xlsx")
    if not os.path.exists(export_file_path):
        table1_path = os.path.join('Y:\\BCR\\202407', 'Scopusbooks04072023.xlsx')
        table2_path = os.path.join('Y:\\BCR\\202407', 'Scopus Books June 2023新增书目3.9种及检索式.xlsx')
        table1 = pd.read_excel(table1_path, sheet_name=0)
        table1 = table1[keep_columns]
        table2 = pd.read_excel(table2_path, sheet_name=0)
        table2 = table2[keep_columns]
        table0 = pd.concat([table1, table2])
        # Rows sharing all three ISBN fields are the same book; keep the later row.
        table0.drop_duplicates(subset=['Print ISBN', 'E-ISBN', 'Other ISBN'], keep='last', inplace=True)
        # Normalize to str so downstream comparisons/joins see a uniform type.
        table0['Scopus ID'] = table0['Scopus ID'].astype(str)
        if export:
            table0.to_excel(export_file_path, index=False)
    else:
        table0 = pd.read_excel(export_file_path, sheet_name=0)
    return table0
def step2(table: pd.DataFrame, export: bool = True):
    """Aggregate usage counts per ISBN (PPT requirement #2), keeping one DOI.

    Args:
        table: merged usage table with 'DOI', 'ISBN' and the count columns
            '2021', '2022', '2023', 'Grand Total'.
        export: when True, write the result under ROOT_PATH\\RESULT.

    Returns:
        One row per ISBN with summed counts and the first DOI seen for it.
    """
    count_cols = ['2021', '2022', '2023', 'Grand Total']
    # Select with a list: the original `groupby(...)['a', 'b', ...]` tuple
    # indexing is deprecated and removed in pandas >= 2.0.
    group_by = table.groupby(by=['ISBN'])[count_cols].sum()
    group_by.reset_index(inplace=True)
    # Keep one reference DOI per ISBN; non-inplace dedupe avoids the
    # SettingWithCopyWarning the inplace-on-slice version produced.
    keep_columns = table[['DOI', 'ISBN']].drop_duplicates(keep='first', subset=['ISBN'])
    result_table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)
    return result_table
def step3(table: pd.DataFrame, export: bool = True):
    """Match the ISBN-merged records against last year's BCR master table.

    Matching order: DOI first, then ISBN for rows lacking a DOI. Matched rows
    gain the master table's fields (renamed with a '-Other' suffix).

    Returns:
        (result_table, all_no_data): the combined DOI+ISBN match result, and
        the rows (base columns only) matched by neither key.
    """
    BASE_FILE = pd.read_excel("Y:\\BCR\\2024BCR\\BCR2023数据处理\\副本BCR2022总表-20220729.xlsx",
                              sheet_name=0)  # last year's BCR master table
    # Tag the master table's headers so merged columns are distinguishable.
    new_columns = data_process_tool.rename_head(BASE_FILE, postfix='-Other')
    BASE_FILE.rename(columns=new_columns, inplace=True)
    doi_table, isbn_table = process1(table)
    doi_ = pd.merge(doi_table, BASE_FILE, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    """
    # 把doi分成有数据的和没数据的
    has_data = doi_[doi_['DOI-Other'].notnull()]  # 匹配到数据
    no_data = doi_[doi_['DOI-Other'].isnull()]  # 使用doi没有匹配到的
    # del doi_
    no_data = no_data[table.columns.values.tolist()]  # 把没有数据的多余列去除
    """
    # Rows without a DOI are matched by ISBN instead; no further fallback
    # key exists after this one.
    isbn_ = pd.merge(isbn_table, BASE_FILE, how='left', left_on=['ISBN'], right_on=['ISBN-Other'])
    # Combine the DOI-match and ISBN-match results.
    result_table = pd.concat([doi_, isbn_])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR匹配结果.xlsx'), index=False)
    # Rows matched by neither DOI nor ISBN.
    all_no_data = result_table[result_table['ISBN-Other'].isnull()]
    all_no_data = all_no_data[table.columns.values.tolist()]  # keep only the base columns
    if export:
        all_no_data.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), index=False)
    return result_table, all_no_data
def step4(table: pd.DataFrame, export: bool = True):
    """Handle records that last year's BCR master table did not match.

    Per the original notes: explode each record's ISBNs, cross-match them
    against the Scopus source-book list (three passes: 'Print ISBN', then
    'E-ISBN', then 'Other ISBN'), and merge the matched fields. The remaining
    steps mentioned in the original notes (OASIS match, author/subject
    enrichment, sensitive-title removal) are not performed in this function.
    """
    df1 = process_func2(table)  # unmatched records, one row per individual ISBN
    df1.drop_duplicates(subset=['ISBN RAW', 'ISBN'], inplace=True)
    df1['ISBN'] = df1['ISBN'].astype(str)
    print(df1)
    df2 = process_func3(export=export)
    # Normalize every candidate ISBN column to str so merges compare like types.
    for col in ['Print ISBN', 'E-ISBN', 'Other ISBN']:
        df2[col] = df2[col].astype(str)
    # Pass 1: match on 'Print ISBN'.
    c1 = pd.merge(df1, df2, left_on=['ISBN'], right_on=['Print ISBN'], how='left')
    c1_in = c1[c1['Print ISBN'].notnull()]
    c1_not = c1[c1['Print ISBN'].isnull()]
    # NOTE(review): C_COLUMNS lists '2022'/'2023'/'2024' while main() produces
    # '2021'/'2022'/'2023' count columns — confirm the intended year range.
    c1_not = c1_not[C_COLUMNS]
    # Pass 2: remaining rows matched on 'E-ISBN'.
    c2 = pd.merge(c1_not, df2, left_on=['ISBN'], right_on=['E-ISBN'], how='left')
    c2_in = c2[c2['E-ISBN'].notnull()]
    c2_not = c2[c2['E-ISBN'].isnull()]
    c2_not = c2_not[C_COLUMNS]
    # Pass 3: remaining rows matched on 'Other ISBN'.
    c3 = pd.merge(c2_not, df2, left_on=['ISBN'], right_on=['Other ISBN'], how='left')
    c3_in = c3[c3['Other ISBN'].notnull()]
    c3_not = c3[c3['Other ISBN'].isnull()]
    # Combine the three match passes; one row per original multi-ISBN string.
    r1_in = pd.concat([c1_in, c2_in, c3_in])
    r1_in.drop_duplicates(subset=['ISBN RAW'], inplace=True)
    r1_not = c3_not
    # Anti-join trick: appending r1_in twice then dropping ALL duplicates
    # (keep=False) removes from r1_not every 'ISBN RAW' present in r1_in.
    r1_not = pd.concat([r1_not, r1_in, r1_in]).drop_duplicates(subset=['ISBN RAW'], keep=False)
    r1_not = r1_not[['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total']]
    r1_not.rename(columns={'ISBN RAW': 'ISBN'}, inplace=True)
    if export:
        r1_in.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目匹配.xlsx'), index=False)
        r1_not.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目未匹配到.xlsx'), index=False)
if __name__ == '__main__':
    main()
# Ad-hoc rerun snippet kept as a note: resume the pipeline from the cached
# step-2 export instead of rebuilding from the raw merge files.
"""
step2_table = pd.read_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), sheet_name=0)
step3_table, no_data_table = step3(step2_table, export=True)
step4(no_data_table)
"""
# ste3_table = pd.read_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), sheet_name=0)
# step4(ste3_table)