You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

205 lines
8.0 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2024/2/2 10:45
# @Author : zhaoxiangpeng
# @File : BCR_20240201.py
import os
from copy import deepcopy
import pandas as pd
import data_process_tool
from bcr.utils import read_file, str2float, str2int
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
# ROOT_PATH = "Y:\\zhaoxiangpeng\\2024BCR"
# ROOT_PATH = "Y:\\zhaoxiangpeng\\BCR202403"
# ROOT_PATH = "Y:\\BCR\\202407"
C_COLUMNS = ['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total', 'ISBN']
def main():
    """Entry point: merge the raw usage exports, normalize the count columns,
    then run pipeline steps 2-4 (aggregate, match last year's BCR, fallback match)."""
    merged = read_file(os.path.join(ROOT_PATH, 'MergeFile'))
    extra = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t')
    merged = pd.concat([merged, extra])
    # keep='last' so the supplementary rows override the originals on EID clash
    merged.drop_duplicates(subset=['EID'], keep='last', inplace=True)
    # Normalize the usage-count columns to numeric form.
    for count_col in ('2021', '2022', '2023', 'Grand Total'):
        merged[count_col] = merged[count_col].apply(str2float)
    step2_result = step2(merged, export=True)
    _, unmatched = step3(step2_result, export=True)
    step4(unmatched)
def process1(table: pd.DataFrame):
    """Split *table* into a DOI-keyed frame and an ISBN-keyed frame for matching.

    Args:
        table: frame containing at least 'DOI' and 'ISBN' columns.

    Returns:
        (doi_frame, isbn_frame): rows that have a DOI, deduplicated on 'DOI';
        and rows lacking a DOI, deduplicated on 'ISBN' (matched by ISBN later).
    """
    TABLE2 = deepcopy(table)
    # Rows with a DOI: keep one row per DOI for the DOI-based match.
    # Chained non-inplace calls replace the previous inplace-on-slice pattern,
    # which raised SettingWithCopyWarning and is unreliable under pandas
    # copy-on-write semantics.
    DOI_PROED = TABLE2.dropna(subset=['DOI']).drop_duplicates(subset=['DOI'])
    # Rows without a DOI fall back to ISBN matching; one row per ISBN.
    ISBN_PROED = TABLE2[TABLE2['DOI'].isnull()].drop_duplicates(subset=['ISBN'])
    return DOI_PROED, ISBN_PROED
def process_func2(table: pd.DataFrame):
    """Explode the '; '-separated 'ISBN' column into one row per ISBN.

    The original multi-ISBN string is preserved under the new column
    'ISBN RAW'; the exploded single ISBNs take over the 'ISBN' name.
    """
    frame = deepcopy(table)
    # Cast to str first — non-string cells would otherwise fail to split.
    frame['ISBN'] = frame['ISBN'].astype(str)
    single_isbns = (
        frame['ISBN']
        .str.split('; ', expand=True)
        .stack()                           # wide -> long: one ISBN per row
        .reset_index(level=1, drop=True)   # drop the inner position level
    )
    single_isbns.name = 'ISBN'
    # Rename the raw column, then index-align join: each source row is
    # repeated once per individual ISBN it contained.
    exploded: pd.DataFrame = frame.rename(columns={'ISBN': 'ISBN RAW'}).join(single_isbns)
    return exploded
def process_func3(export: bool = True):
    """Merge the two Scopus source-book workbooks into one deduplicated table.

    If the merged export does not exist yet, it is built from the two raw
    workbooks and (when *export* is True) written to disk; otherwise the
    previously exported file is loaded and returned as-is.

    NOTE(review): the raw workbooks are read from the hard-coded
    'Y:\\BCR\\202407' directory rather than ROOT_PATH — confirm this is
    intentional when ROOT_PATH points elsewhere.
    """
    # Columns carried over from both workbooks.
    keep_columns = [
        'Title', 'Scopus ID',
        'Print ISBN', 'E-ISBN', 'Other ISBN',
        'Publication year', 'Publisher imprint', 'Publisher imprints grouped to main Publisher',
        'Classification 1', 'Classification 2', 'Classification 3', 'Classification 4',
    ]
    export_file_path = os.path.join(ROOT_PATH, "After\\4.两表字段合并.xlsx")
    if not os.path.exists(export_file_path):
        table1_path = os.path.join('Y:\\BCR\\202407', 'Scopusbooks04072023.xlsx')
        table2_path = os.path.join('Y:\\BCR\\202407', 'Scopus Books June 2023新增书目3.9种及检索式.xlsx')
        table1 = pd.read_excel(table1_path, sheet_name=0)
        table1 = table1[keep_columns]
        table2 = pd.read_excel(table2_path, sheet_name=0)
        table2 = table2[keep_columns]
        table0 = pd.concat([table1, table2])
        # Rows sharing all three ISBN fields are the same book; keep the later row.
        table0.drop_duplicates(subset=['Print ISBN', 'E-ISBN', 'Other ISBN'], keep='last', inplace=True)
        # Normalize to str so downstream comparisons/joins see a uniform type.
        table0['Scopus ID'] = table0['Scopus ID'].astype(str)
        if export:
            table0.to_excel(export_file_path, index=False)
    else:
        table0 = pd.read_excel(export_file_path, sheet_name=0)
    return table0
def step2(table: pd.DataFrame, export: bool = True):
    """Aggregate usage counts per ISBN (PPT requirement #2), keeping one DOI.

    Args:
        table: merged usage table with 'DOI', 'ISBN' and the count columns
            '2021', '2022', '2023', 'Grand Total'.
        export: when True, write the result under ROOT_PATH\\RESULT.

    Returns:
        One row per ISBN with summed counts and the first DOI seen for it.
    """
    count_cols = ['2021', '2022', '2023', 'Grand Total']
    # Select with a list: the original `groupby(...)['a', 'b', ...]` tuple
    # indexing is deprecated and removed in pandas >= 2.0.
    group_by = table.groupby(by=['ISBN'])[count_cols].sum()
    group_by.reset_index(inplace=True)
    # Keep one reference DOI per ISBN; non-inplace dedupe avoids the
    # SettingWithCopyWarning the inplace-on-slice version produced.
    keep_columns = table[['DOI', 'ISBN']].drop_duplicates(keep='first', subset=['ISBN'])
    result_table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)
    return result_table
def step3(table: pd.DataFrame, export: bool = True):
    """Match the ISBN-merged records against last year's BCR master table.

    Matching order: DOI first, then ISBN for rows lacking a DOI. Matched rows
    gain the master table's fields (renamed with a '-Other' suffix).

    Returns:
        (result_table, all_no_data): the combined DOI+ISBN match result, and
        the rows (base columns only) matched by neither key.
    """
    BASE_FILE = pd.read_excel("Y:\\BCR\\2024BCR\\BCR2023数据处理\\副本BCR2022总表-20220729.xlsx",
                              sheet_name=0)  # last year's BCR master table
    # Tag the master table's headers so merged columns are distinguishable.
    new_columns = data_process_tool.rename_head(BASE_FILE, postfix='-Other')
    BASE_FILE.rename(columns=new_columns, inplace=True)
    doi_table, isbn_table = process1(table)
    doi_ = pd.merge(doi_table, BASE_FILE, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    """
    # 把doi分成有数据的和没数据的
    has_data = doi_[doi_['DOI-Other'].notnull()]  # 匹配到数据
    no_data = doi_[doi_['DOI-Other'].isnull()]  # 使用doi没有匹配到的
    # del doi_
    no_data = no_data[table.columns.values.tolist()]  # 把没有数据的多余列去除
    """
    # Rows without a DOI are matched by ISBN instead; no further fallback
    # key exists after this one.
    isbn_ = pd.merge(isbn_table, BASE_FILE, how='left', left_on=['ISBN'], right_on=['ISBN-Other'])
    # Combine the DOI-match and ISBN-match results.
    result_table = pd.concat([doi_, isbn_])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR匹配结果.xlsx'), index=False)
    # Rows matched by neither DOI nor ISBN.
    all_no_data = result_table[result_table['ISBN-Other'].isnull()]
    all_no_data = all_no_data[table.columns.values.tolist()]  # keep only the base columns
    if export:
        all_no_data.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), index=False)
    return result_table, all_no_data
def step4(table: pd.DataFrame, export: bool = True):
    """Handle records that last year's BCR master table did not match.

    Per the original notes: explode each record's ISBNs, cross-match them
    against the Scopus source-book list (three passes: 'Print ISBN', then
    'E-ISBN', then 'Other ISBN'), and merge the matched fields. The remaining
    steps mentioned in the original notes (OASIS match, author/subject
    enrichment, sensitive-title removal) are not performed in this function.
    """
    df1 = process_func2(table)  # unmatched records, one row per individual ISBN
    df1.drop_duplicates(subset=['ISBN RAW', 'ISBN'], inplace=True)
    df1['ISBN'] = df1['ISBN'].astype(str)
    print(df1)
    df2 = process_func3(export=export)
    # Normalize every candidate ISBN column to str so merges compare like types.
    for col in ['Print ISBN', 'E-ISBN', 'Other ISBN']:
        df2[col] = df2[col].astype(str)
    # Pass 1: match on 'Print ISBN'.
    c1 = pd.merge(df1, df2, left_on=['ISBN'], right_on=['Print ISBN'], how='left')
    c1_in = c1[c1['Print ISBN'].notnull()]
    c1_not = c1[c1['Print ISBN'].isnull()]
    # NOTE(review): C_COLUMNS lists '2022'/'2023'/'2024' while main() produces
    # '2021'/'2022'/'2023' count columns — confirm the intended year range.
    c1_not = c1_not[C_COLUMNS]
    # Pass 2: remaining rows matched on 'E-ISBN'.
    c2 = pd.merge(c1_not, df2, left_on=['ISBN'], right_on=['E-ISBN'], how='left')
    c2_in = c2[c2['E-ISBN'].notnull()]
    c2_not = c2[c2['E-ISBN'].isnull()]
    c2_not = c2_not[C_COLUMNS]
    # Pass 3: remaining rows matched on 'Other ISBN'.
    c3 = pd.merge(c2_not, df2, left_on=['ISBN'], right_on=['Other ISBN'], how='left')
    c3_in = c3[c3['Other ISBN'].notnull()]
    c3_not = c3[c3['Other ISBN'].isnull()]
    # Combine the three match passes; one row per original multi-ISBN string.
    r1_in = pd.concat([c1_in, c2_in, c3_in])
    r1_in.drop_duplicates(subset=['ISBN RAW'], inplace=True)
    r1_not = c3_not
    # Anti-join trick: appending r1_in twice then dropping ALL duplicates
    # (keep=False) removes from r1_not every 'ISBN RAW' present in r1_in.
    r1_not = pd.concat([r1_not, r1_in, r1_in]).drop_duplicates(subset=['ISBN RAW'], keep=False)
    r1_not = r1_not[['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total']]
    r1_not.rename(columns={'ISBN RAW': 'ISBN'}, inplace=True)
    if export:
        r1_in.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目匹配.xlsx'), index=False)
        r1_not.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目未匹配到.xlsx'), index=False)
if __name__ == '__main__':
    main()
# Ad-hoc rerun snippet kept as a note: resume the pipeline from the cached
# step-2 export instead of rebuilding from the raw merge files.
"""
step2_table = pd.read_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), sheet_name=0)
step3_table, no_data_table = step3(step2_table, export=True)
step4(no_data_table)
"""
# ste3_table = pd.read_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), sheet_name=0)
# step4(ste3_table)