# -*- coding: utf-8 -*-
# @Time : 2024/2/2 10:45
# @Author : zhaoxiangpeng
# @File : BCR_20240201.py

import os
from copy import deepcopy

import pandas as pd

import data_process_tool
from bcr.utils import read_file, str2float, str2int
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH

# ROOT_PATH = "Y:\\zhaoxiangpeng\\2024BCR"
# ROOT_PATH = "Y:\\zhaoxiangpeng\\BCR202403"
# ROOT_PATH = "Y:\\BCR\\202407"

# Base columns carried through the three Scopus match passes in step4
C_COLUMNS = ['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total', 'ISBN']

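# Pipeline overview (reconstructed from main() below): merge the raw usage
# files, normalize the yearly counts, aggregate usage by ISBN (step2), match
# the aggregates against last year's BCR master table (step3), then match the
# leftovers against the Scopus source book list (step4).

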
def main():
    table = read_file(os.path.join(ROOT_PATH, 'MergeFile'))

    t2 = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t')
    table = pd.concat([table, t2])
    table.drop_duplicates(subset=['EID'], keep='last', inplace=True)

    # Normalize the usage-count columns to numeric values
    table['2021'] = table['2021'].apply(str2float)
    table['2022'] = table['2022'].apply(str2float)
    table['2023'] = table['2023'].apply(str2float)
    table['Grand Total'] = table['Grand Total'].apply(str2float)

    step2_table = step2(table, export=True)
    step3_table, no_data_table = step3(step2_table, export=True)
    step4(no_data_table)


def process1(table: pd.DataFrame):
    TABLE2 = deepcopy(table)
    # Rename the headers
    # new_columns = data_process_tool.rename_head(TABLE2, postfix='-Other')
    # TABLE2.rename(columns=new_columns, inplace=True)

    # Deduplicate by DOI so each DOI appears only once for DOI matching
    # (.copy() avoids pandas' SettingWithCopyWarning on the in-place drop below)
    DOI_PROED = TABLE2.dropna(subset=['DOI']).copy()
    DOI_PROED.drop_duplicates(subset=['DOI'], inplace=True)
    # Rows whose DOI is empty are kept separately and matched by ISBN instead
    ISBN_PROED = TABLE2[TABLE2['DOI'].isnull()].copy()
    ISBN_PROED.drop_duplicates(subset=['ISBN'], inplace=True)

    return DOI_PROED, ISBN_PROED


def process_func2(table: pd.DataFrame):
    """
    Split the ISBN field into separate rows, one ISBN per row
    """
    TABLE2 = deepcopy(table)
    TABLE2['ISBN'] = TABLE2['ISBN'].astype(str)  # Must be str, otherwise the split fails
    ISBNs = TABLE2['ISBN'].str.split('; ', expand=True)
    ISBNs = ISBNs.stack()  # Pivot the split columns into rows (one row per ISBN)
    ISBNs = ISBNs.reset_index(level=1, drop=True)  # Drop the extra index level so it aligns with the source rows
    ISBNs.name = 'ISBN'
    EID_PROED: pd.DataFrame = TABLE2.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs)
    return EID_PROED
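
# A minimal sketch of the split/stack idiom in process_func2, with made-up
# values (the toy frame is illustrative, not project data):
#
#     df = pd.DataFrame({'EID': ['a', 'b'], 'ISBN': ['111; 222', '333']})
#     s = df['ISBN'].str.split('; ', expand=True).stack()
#     # s -> (0, 0): '111', (0, 1): '222', (1, 0): '333'
#     # After reset_index(level=1, drop=True) the index aligns with df's rows,
#     # so join() fans row 'a' out into two rows, one per ISBN.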


def process_func3(export: bool = True):
    """
    Merge the two Scopus source tables
    """
    keep_columns = [
        'Title', 'Scopus ID',
        'Print ISBN', 'E-ISBN', 'Other ISBN',
        'Publication year', 'Publisher imprint', 'Publisher imprints grouped to main Publisher',
        'Classification 1', 'Classification 2', 'Classification 3', 'Classification 4',
    ]
    export_file_path = os.path.join(ROOT_PATH, "After\\4.两表字段合并.xlsx")
    if not os.path.exists(export_file_path):
        table1_path = os.path.join('Y:\\BCR\\202407', 'Scopusbooks04072023.xlsx')
        table2_path = os.path.join('Y:\\BCR\\202407', 'Scopus Books June 2023新增书目3.9种及检索式.xlsx')
        table1 = pd.read_excel(table1_path, sheet_name=0)
        table1 = table1[keep_columns]
        table2 = pd.read_excel(table2_path, sheet_name=0)
        table2 = table2[keep_columns]
        table0 = pd.concat([table1, table2])
        table0.drop_duplicates(subset=['Print ISBN', 'E-ISBN', 'Other ISBN'], keep='last', inplace=True)
        table0['Scopus ID'] = table0['Scopus ID'].astype(str)
        if export:
            table0.to_excel(export_file_path, index=False)
    else:
        table0 = pd.read_excel(export_file_path, sheet_name=0)
    return table0
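
# NOTE: process_func3 acts as a build-once cache: the merged table is written
# to export_file_path on the first run and read back unchanged on later runs.
# If the file is absent and export=False, the merge is simply redone each run
# and nothing is cached.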


def step2(table: pd.DataFrame, export: bool = True):
    """
    Second requirement from the PPT: aggregate usage counts by ISBN
    """
    # Sum the yearly counts per ISBN (double brackets select a list of
    # columns; the old single-bracket tuple form is rejected by modern pandas)
    group_by = table.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum()
    group_by.reset_index(inplace=True)

    # Keep the first DOI per ISBN so each aggregate row retains a DOI for matching
    keep_columns = table[['DOI', 'ISBN']].copy()
    keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True)

    result_table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)

    return result_table
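
# A toy illustration of step2's aggregate-then-reattach-DOI shape, with
# hypothetical values:
#
#     table:         DOI=d1 ISBN=x 2021=1 | DOI=d2 ISBN=x 2021=2 | DOI=d3 ISBN=y 2021=5
#     group_by:      ISBN=x sum=3         | ISBN=y sum=5
#     result_table:  DOI=d1 ISBN=x sum=3  | DOI=d3 ISBN=y sum=5   (first DOI per ISBN kept)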


def step3(table: pd.DataFrame, export: bool = True):
    """
    Match the ISBN-aggregated records against last year's BCR master table.
    Match by DOI first, then by ISBN; for matched records, merge the columns
    of the two tables.
    """
    BASE_FILE = pd.read_excel("Y:\\BCR\\2024BCR\\BCR2023数据处理\\副本BCR2022总表-20220729.xlsx",
                              sheet_name=0)  # Last year's BCR master table
    # Tag the headers so the merged-in columns are distinguishable
    new_columns = data_process_tool.rename_head(BASE_FILE, postfix='-Other')
    BASE_FILE.rename(columns=new_columns, inplace=True)

    doi_table, isbn_table = process1(table)
    doi_ = pd.merge(doi_table, BASE_FILE, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    """
    # Split the DOI rows into matched and unmatched
    has_data = doi_[doi_['DOI-Other'].notnull()]  # Matched
    no_data = doi_[doi_['DOI-Other'].isnull()]  # Not matched by DOI
    # del doi_
    no_data = no_data[table.columns.values.tolist()]  # Drop the surplus columns from the unmatched rows
    """
    # Match the rows that have no DOI by ISBN; leftovers need no further
    # handling here, since no matching keys remain after this
    isbn_ = pd.merge(isbn_table, BASE_FILE, how='left', left_on=['ISBN'], right_on=['ISBN-Other'])
    # Combine the DOI match results with the ISBN ones
    result_table = pd.concat([doi_, isbn_])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR匹配结果.xlsx'), index=False)

    # Rows matched by neither DOI nor ISBN
    all_no_data = result_table[result_table['ISBN-Other'].isnull()]
    all_no_data = all_no_data[table.columns.values.tolist()]  # Keep only the base columns
    if export:
        all_no_data.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), index=False)

    return result_table, all_no_data
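
# The unmatched filter above is a left-join anti-join: after a left merge,
# rows whose right-hand key column ('ISBN-Other') is null had no counterpart
# in BASE_FILE. A minimal sketch with hypothetical frames:
#
#     merged = pd.merge(left, right, how='left', left_on=['ISBN'], right_on=['ISBN-Other'])
#     unmatched = merged[merged['ISBN-Other'].isnull()]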


def step4(table: pd.DataFrame, export: bool = True):
    """
    Handle records that did not match last year's BCR master table:
    match them against the Scopus source book list
    (split the ISBNs of both tables and cross-match,
    then merge the columns of the two tables),
    then match against the OASIS records
    to obtain author and subject-classification data,
    and drop sensitive titles.
    """
    df1 = process_func2(table)  # Unmatched records, one row per ISBN
    df1.drop_duplicates(subset=['ISBN RAW', 'ISBN'], inplace=True)
    df1['ISBN'] = df1['ISBN'].astype(str)
    print(df1)

    df2 = process_func3(export=export)
    for col in ['Print ISBN', 'E-ISBN', 'Other ISBN']:
        df2[col] = df2[col].astype(str)

    # Pass 1: match on Print ISBN
    c1 = pd.merge(df1, df2, left_on=['ISBN'], right_on=['Print ISBN'], how='left')
    c1_in = c1[c1['Print ISBN'].notnull()]
    c1_not = c1[c1['Print ISBN'].isnull()]
    c1_not = c1_not[C_COLUMNS]

    # Pass 2: match the leftovers on E-ISBN
    c2 = pd.merge(c1_not, df2, left_on=['ISBN'], right_on=['E-ISBN'], how='left')
    c2_in = c2[c2['E-ISBN'].notnull()]
    c2_not = c2[c2['E-ISBN'].isnull()]
    c2_not = c2_not[C_COLUMNS]

    # Pass 3: match the remaining leftovers on Other ISBN
    c3 = pd.merge(c2_not, df2, left_on=['ISBN'], right_on=['Other ISBN'], how='left')
    c3_in = c3[c3['Other ISBN'].notnull()]
    c3_not = c3[c3['Other ISBN'].isnull()]

    # Combine the results of the three match passes
    r1_in = pd.concat([c1_in, c2_in, c3_in])
    r1_in.drop_duplicates(subset=['ISBN RAW'], inplace=True)

    # Set difference: appending r1_in twice and dropping all duplicates with
    # keep=False removes every row whose 'ISBN RAW' appears in r1_in
    r1_not = c3_not
    r1_not = pd.concat([r1_not, r1_in, r1_in]).drop_duplicates(subset=['ISBN RAW'], keep=False)
    r1_not = r1_not[['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total']]
    r1_not.rename(columns={'ISBN RAW': 'ISBN'}, inplace=True)
    if export:
        r1_in.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目匹配.xlsx'), index=False)
        r1_not.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目未匹配到.xlsx'), index=False)
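
# A minimal sketch of the concat-twice set-difference trick used in step4,
# with made-up keys:
#
#     a = pd.DataFrame({'k': [1, 2, 3]})
#     b = pd.DataFrame({'k': [2, 3]})
#     only_a = pd.concat([a, b, b]).drop_duplicates(subset=['k'], keep=False)
#     # only_a -> k == 1. b is appended twice so that keys present only in b
#     # also occur more than once and are likewise dropped by keep=False.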


if __name__ == '__main__':
    main()
    """
    step2_table = pd.read_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), sheet_name=0)
    step3_table, no_data_table = step3(step2_table, export=True)
    step4(no_data_table)
    """
    # ste3_table = pd.read_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), sheet_name=0)
    # step4(ste3_table)