# -*- coding: utf-8 -*- # @Time : 2024/2/2 10:45 # @Author : zhaoxiangpeng # @File : BCR_20240201.py import os from copy import deepcopy import pandas as pd import data_process_tool from bcr.utils import read_file, str2float, str2int from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH # ROOT_PATH = "Y:\\zhaoxiangpeng\\2024BCR" # ROOT_PATH = "Y:\\zhaoxiangpeng\\BCR202403" # ROOT_PATH = "Y:\\BCR\\202407" C_COLUMNS = ['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total', 'ISBN'] def main(): table = read_file(os.path.join(ROOT_PATH, 'MergeFile')) t2 = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t') table = pd.concat([table, t2]) table.drop_duplicates(subset=['EID'], keep='last', inplace=True) # 把数量统计标准化 table['2021'] = table['2021'].apply(str2float) table['2022'] = table['2022'].apply(str2float) table['2023'] = table['2023'].apply(str2float) table['Grand Total'] = table['Grand Total'].apply(str2float) step2_table = step2(table, export=True) step3_table, no_data_table = step3(step2_table, export=True) step4(no_data_table) def process1(table: pd.DataFrame): TABLE2 = deepcopy(table) # 表头重命名 # new_columns = data_process_tool.rename_head(TABLE2, postfix='-Other') # TABLE2.rename(columns=new_columns, inplace=True) # 根据doi去重只保留一个用于doi匹配 DOI_PROED = TABLE2.dropna(subset=['DOI']) DOI_PROED.drop_duplicates(subset=['DOI'], inplace=True) # 把doi为空的删掉,没有doi的用isbn匹配 ISBN_PROED = TABLE2[TABLE2['DOI'].isnull()] ISBN_PROED.drop_duplicates(subset=['ISBN'], inplace=True) return DOI_PROED, ISBN_PROED def process_func2(table: pd.DataFrame): """ isbn分列 """ TABLE2 = deepcopy(table) TABLE2['ISBN'] = TABLE2['ISBN'].astype(str) # 要转为str类型,不然会分不到 ISBNs = TABLE2['ISBN'].str.split('; ', expand=True) ISBNs = ISBNs.stack() # 把行转成列 ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引 ISBNs.name = 'ISBN' EID_PROED: pd.DataFrame = TABLE2.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs) return EID_PROED def process_func3(export: bool = True): """ 合并两个scopus表 """ keep_columns = [ 'Title', 'Scopus ID', 'Print ISBN', 'E-ISBN', 'Other ISBN', 'Publication year', 'Publisher imprint', 'Publisher imprints grouped to main Publisher', 'Classification 1', 'Classification 2', 'Classification 3', 'Classification 4', ] export_file_path = os.path.join(ROOT_PATH, "After\\4.两表字段合并.xlsx") if not os.path.exists(export_file_path): table1_path = os.path.join('Y:\\BCR\\202407', 'Scopusbooks04072023.xlsx') table2_path = os.path.join('Y:\\BCR\\202407', 'Scopus Books June 2023新增书目3.9种及检索式.xlsx') table1 = pd.read_excel(table1_path, sheet_name=0) table1 = table1[keep_columns] table2 = pd.read_excel(table2_path, sheet_name=0) table2 = table2[keep_columns] table0 = pd.concat([table1, table2]) table0.drop_duplicates(subset=['Print ISBN', 'E-ISBN', 'Other ISBN'], keep='last', inplace=True) table0['Scopus ID'] = table0['Scopus ID'].astype(str) if export: table0.to_excel(export_file_path, index=False) else: table0 = pd.read_excel(export_file_path, sheet_name=0) return table0 def step2(table: pd.DataFrame, export: bool = True): """ ppt第二个需求 """ group_by = table.groupby(by=['ISBN'])['2021', '2022', '2023', 'Grand Total'].sum() group_by.reset_index(inplace=True) keep_columns = table[['DOI', 'ISBN']] keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True) result_table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN']) if export: result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False) return result_table def step3(table: pd.DataFrame, export: bool = True): """ ISBN合并记录与上一年BCR总表匹配 按DOI、ISBN的顺序匹配,匹配到的记录,二个表的字段合并 """ BASE_FILE = pd.read_excel("Y:\\BCR\\2024BCR\\BCR2023数据处理\\副本BCR2022总表-20220729.xlsx", sheet_name=0) # 上一年的BCR总表 # 表头加标记 new_columns = data_process_tool.rename_head(BASE_FILE, postfix='-Other') BASE_FILE.rename(columns=new_columns, inplace=True) doi_table, isbn_table = process1(table) doi_ = pd.merge(doi_table, BASE_FILE, how='left', left_on=['DOI'], right_on=['DOI-Other']) """ # 把doi分成有数据的和没数据的 has_data = doi_[doi_['DOI-Other'].notnull()] # 匹配到数据 no_data = doi_[doi_['DOI-Other'].isnull()] # 使用doi没有匹配到的 # del doi_ no_data = no_data[table.columns.values.tolist()] # 把没有数据的多余列去除 """ # 用没有匹配到doi的数据用isbn进行匹配 isbn_ = pd.merge(isbn_table, BASE_FILE, how='left', left_on=['ISBN'], right_on=['ISBN-Other']) # 这些就不用考虑没有匹配到的了,因为没有剩下的条件了 # 合并doi匹配结果和isbn的结果 result_table = pd.concat([doi_, isbn_]) if export: result_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR匹配结果.xlsx'), index=False) # 通过doi和isbn都没有匹配到的 all_no_data = result_table[result_table['ISBN-Other'].isnull()] all_no_data = all_no_data[table.columns.values.tolist()] # 保留基础列 if export: all_no_data.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), index=False) return result_table, all_no_data def step4(table: pd.DataFrame, export: bool = True): """ ISBN合并记录与上一年BCR总表不匹配记录处理 与SCOPUS来源书目匹配 把二个表的ISBN分列,进行交叉匹配 把二个表的字段进行合并 再与OASIS记录匹配 获取作者、学科分类数据 删除敏感书目 """ df1 = process_func2(table) # 不匹配记录 df1.drop_duplicates(subset=['ISBN RAW', 'ISBN'], inplace=True) df1['ISBN'] = df1['ISBN'].astype(str) print(df1) df2 = process_func3(export=export) for col in ['Print ISBN', 'E-ISBN', 'Other ISBN']: df2[col] = df2[col].astype(str) c1 = pd.merge(df1, df2, left_on=['ISBN'], right_on=['Print ISBN'], how='left') c1_in = c1[c1['Print ISBN'].notnull()] c1_not = c1[c1['Print ISBN'].isnull()] c1_not = c1_not[C_COLUMNS] c2 = pd.merge(c1_not, df2, left_on=['ISBN'], right_on=['E-ISBN'], how='left') c2_in = c2[c2['E-ISBN'].notnull()] c2_not = c2[c2['E-ISBN'].isnull()] c2_not = c2_not[C_COLUMNS] c3 = pd.merge(c2_not, df2, left_on=['ISBN'], right_on=['Other ISBN'], how='left') c3_in = c3[c3['Other ISBN'].notnull()] c3_not = c3[c3['Other ISBN'].isnull()] # 3次匹配结果合并 r1_in = pd.concat([c1_in, c2_in, c3_in]) r1_in.drop_duplicates(subset=['ISBN RAW'], inplace=True) r1_not = c3_not r1_not = pd.concat([r1_not, r1_in, r1_in]).drop_duplicates(subset=['ISBN RAW'], keep=False) r1_not = r1_not[['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total']] r1_not.rename(columns={'ISBN RAW': 'ISBN'}, inplace=True) if export: r1_in.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目匹配.xlsx'), index=False) r1_not.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目未匹配到.xlsx'), index=False) if __name__ == '__main__': main() """ step2_table = pd.read_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), sheet_name=0) step3_table, no_data_table = step3(step2_table, export=True) step4(no_data_table) """ # ste3_table = pd.read_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), sheet_name=0) # step4(ste3_table)