|
|
# -*- coding: utf-8 -*-
|
|
|
# @Time : 2024/12/24 15:03
|
|
|
# @Author : zhaoxiangpeng
|
|
|
# @File : BCR_20241224.py
|
|
|
|
|
|
import os
|
|
|
import re
|
|
|
import warnings
|
|
|
import chardet
|
|
|
import pandas as pd
|
|
|
from loguru import logger
|
|
|
from bcr.utils import read_file, str2float, export_small_file
|
|
|
import bcr.BCR_20240724 as bcr_20240724
|
|
|
import bcr.BCR_20240201 as bcr_20240201
|
|
|
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
|
|
|
|
|
|
|
|
|
def task_change1(base_table: pd.DataFrame = None) -> pd.DataFrame:
|
|
|
"""
|
|
|
补充失败的记录重新采集
|
|
|
"""
|
|
|
extend_table = pd.read_excel(os.path.join(ROOT_PATH, 'BCR2024书目补采API.xlsx'), engine='openpyxl', sheet_name=0)
|
|
|
if isinstance(base_table, pd.DataFrame):
|
|
|
# 主表只保留eid用来对补数据的表进行去重
|
|
|
dup_table = base_table[['EID']]
|
|
|
dup_table.drop_duplicates(subset=['EID'], inplace=True)
|
|
|
# eid列改名,防止有冲突
|
|
|
dup_table.rename(columns={'EID': 'dup_eid'}, inplace=True)
|
|
|
# 扩展表的EID和主表的dup_eid列进行左连接,结果表dup_eid为空的的就是需要补充的行
|
|
|
duped_table = extend_table.merge(right=dup_table, how='left', left_on=['EID'], right_on=['dup_eid'])
|
|
|
duped_table = duped_table[duped_table['dup_eid'].isnull()]
|
|
|
duped_table.drop(columns=['dup_eid'], inplace=True)
|
|
|
# 删除用来匹配的列
|
|
|
all_data_table = pd.concat([base_table, duped_table])
|
|
|
return all_data_table
|
|
|
return extend_table
|
|
|
|
|
|
|
|
|
def step1_merge():
|
|
|
path = 'Y:\\BCR\\2025BCR'
|
|
|
path2 = os.path.join(path, 'MergeFile')
|
|
|
files = os.listdir(path2)
|
|
|
big_table = pd.DataFrame()
|
|
|
for file in files:
|
|
|
file_full_path = os.path.join(path2, file)
|
|
|
small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
|
|
|
# small_table = small_table[['EID']]
|
|
|
print(small_table.shape)
|
|
|
big_table = pd.concat([big_table, small_table])
|
|
|
small_table = pd.read_csv(r'Y:\BCR\BCR202412\补采1-20241127 13时37分下载(1).csv')
|
|
|
big_table = pd.concat([big_table, small_table])
|
|
|
return big_table
|
|
|
|
|
|
|
|
|
def step1_merge_change():
|
|
|
"""
|
|
|
处理补采的文件
|
|
|
"""
|
|
|
path2 = os.path.join(ROOT_PATH, 'RESULT\文件和并结果')
|
|
|
files = os.listdir(path2)
|
|
|
big_table = pd.DataFrame()
|
|
|
for file in files:
|
|
|
file_full_path = os.path.join(path2, file)
|
|
|
small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
|
|
|
big_table = pd.concat([big_table, small_table])
|
|
|
return task_change1(big_table)
|
|
|
|
|
|
|
|
|
def step2_change(table: pd.DataFrame, export: bool = True):
|
|
|
# 正常聚合
|
|
|
# 1.求和结果
|
|
|
# 求和前要先把数字类型给统一了
|
|
|
table['2021'] = table['2021'].apply(str2float)
|
|
|
table['2022'] = table['2022'].apply(str2float)
|
|
|
table['2023'] = table['2023'].apply(str2float)
|
|
|
table['Grand Total'] = table['Grand Total'].apply(str2float)
|
|
|
# 把相同ISBN的记录合并成一条记录,多条记录的各年份和GrandTotal引用次数求和
|
|
|
agg_result = table.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum()
|
|
|
agg_result.reset_index(inplace=True) # 重置索引
|
|
|
# 2.分块
|
|
|
filter_table_is = table[table["文献类型"] == "Book"]
|
|
|
filter_table_not = table[table["文献类型"] != "Book"]
|
|
|
filter_table_is[KEEP_COLUMNS]
|
|
|
|
|
|
|
|
|
def main():
|
|
|
STEP_IS_EXIST = True
|
|
|
if STEP_IS_EXIST:
|
|
|
table = step1_merge_change()
|
|
|
|
|
|
# 判断表2的结果是否存在的逻辑
|
|
|
step_2_table_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx")
|
|
|
if not os.path.exists(step_2_table_path):
|
|
|
step2_table = bcr_20240724.step2_change(table, reduce_columns=REDUCE_COLUMNS, keep_columns=KEEP_COLUMNS,
|
|
|
export=True)
|
|
|
else:
|
|
|
step2_table = pd.read_excel(step_2_table_path, sheet_name=0)
|
|
|
|
|
|
# 第三步表结果是否存在的逻辑
|
|
|
no_data_table_path = os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx')
|
|
|
if not os.path.exists(no_data_table_path):
|
|
|
step3_table, no_data_table = bcr_20240201.step3(step2_table, export=True)
|
|
|
else:
|
|
|
no_data_table = pd.read_excel(os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx'), sheet_name=0)
|
|
|
|
|
|
# 处理第4步
|
|
|
bcr_20240201.step4(no_data_table)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main()
|