# -*- coding: utf-8 -*-
# @Time : 2024/12/24 15:03
# @Author : zhaoxiangpeng
# @File : BCR_20241224.py
import os
import re
import warnings
import chardet
import pandas as pd
from loguru import logger
from bcr.utils import read_file, str2float, export_small_file
import bcr.BCR_20240724 as bcr_20240724
import bcr.BCR_20240201 as bcr_20240201
from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH

def task_change1(base_table: pd.DataFrame = None) -> pd.DataFrame:
    """
    Merge in the re-collected records that failed during the original collection.
    """
    extend_table = pd.read_excel(os.path.join(ROOT_PATH, 'BCR2024书目补采API.xlsx'), engine='openpyxl', sheet_name=0)
    if isinstance(base_table, pd.DataFrame):
        # Keep only the EID column of the base table; it is used to de-duplicate the supplementary table
        dup_table = base_table[['EID']].copy()
        dup_table.drop_duplicates(subset=['EID'], inplace=True)
        # Rename the EID column so it cannot clash on the merge below
        dup_table.rename(columns={'EID': 'dup_eid'}, inplace=True)
        # Left-join the extension table's EID against the base table's dup_eid;
        # rows whose dup_eid is null are the ones missing from the base table
        duped_table = extend_table.merge(right=dup_table, how='left', left_on=['EID'], right_on=['dup_eid'])
        duped_table = duped_table[duped_table['dup_eid'].isnull()]
        # Drop the helper column used for matching
        duped_table.drop(columns=['dup_eid'], inplace=True)
        all_data_table = pd.concat([base_table, duped_table])
        return all_data_table
    return extend_table
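

# Hypothetical illustration (not called anywhere in this script): the
# left-join anti-join pattern that task_change1 uses, on made-up toy frames.
def _demo_anti_join() -> pd.DataFrame:
    base = pd.DataFrame({'EID': ['a', 'b']})
    ext = pd.DataFrame({'EID': ['b', 'c'], 'Grand Total': [1, 2]})
    dup = base[['EID']].drop_duplicates().rename(columns={'EID': 'dup_eid'})
    merged = ext.merge(dup, how='left', left_on='EID', right_on='dup_eid')
    # Rows whose dup_eid is null exist only in ext; only the 'c' row survives
    return merged[merged['dup_eid'].isnull()].drop(columns=['dup_eid'])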

def step1_merge():
    """
    Merge every Excel file under the MergeFile directory, plus one
    re-collected CSV, into a single DataFrame.
    """
    path = 'Y:\\BCR\\2025BCR'
    path2 = os.path.join(path, 'MergeFile')
    files = os.listdir(path2)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path2, file)
        small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
        # small_table = small_table[['EID']]
        print(small_table.shape)
        big_table = pd.concat([big_table, small_table])
    small_table = pd.read_csv(r'Y:\BCR\BCR202412\补采1-20241127 13时37分下载(1).csv')
    big_table = pd.concat([big_table, small_table])
    return big_table
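

# Hypothetical helper (an assumption, not in the original script): chardet is
# imported above but never used below; this sketch shows how it could guess a
# CSV's encoding before pd.read_csv. The sample size and the utf-8 fallback
# are assumptions as well.
def _sniff_encoding(csv_path: str, sample_bytes: int = 64 * 1024) -> str:
    with open(csv_path, 'rb') as fh:
        guess = chardet.detect(fh.read(sample_bytes))
    return guess.get('encoding') or 'utf-8'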

def step1_merge_change():
    """
    Process the re-collected files: merge every Excel file under the
    RESULT\\文件和并结果 directory, then fill the gaps via task_change1.
    """
    path2 = os.path.join(ROOT_PATH, 'RESULT\\文件和并结果')
    files = os.listdir(path2)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path2, file)
        small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
        big_table = pd.concat([big_table, small_table])
    return task_change1(big_table)
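

# Hypothetical guard (an assumption, not in the original script): os.listdir
# also returns Excel lock files such as '~$book.xlsx', which openpyxl cannot
# open; the merge loops above could filter their listings through this first.
def _only_real_xlsx(names: list) -> list:
    return [n for n in names if n.endswith('.xlsx') and not n.startswith('~$')]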

def step2_change(table: pd.DataFrame, export: bool = True):
    # Normal aggregation
    # 1. Sum the results: unify the numeric types before summing
    table['2021'] = table['2021'].apply(str2float)
    table['2022'] = table['2022'].apply(str2float)
    table['2023'] = table['2023'].apply(str2float)
    table['Grand Total'] = table['Grand Total'].apply(str2float)
    # Merge the records that share an ISBN into a single record, summing the
    # per-year and Grand Total citation counts over the duplicates
    agg_result = table.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum()
    agg_result.reset_index(inplace=True)  # restore ISBN as a normal column
    # 2. Partition by document type
    filter_table_is = table[table["文献类型"] == "Book"]
    filter_table_not = table[table["文献类型"] != "Book"]
    # Keep only the configured columns for the Book partition
    filter_table_is = filter_table_is[KEEP_COLUMNS]
    return agg_result, filter_table_is, filter_table_not
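

# Hypothetical illustration (not called anywhere in this script) of the ISBN
# aggregation in step2_change, on made-up rows already coerced to float:
def _demo_isbn_sum() -> pd.DataFrame:
    toy = pd.DataFrame({
        'ISBN': ['978-1', '978-1', '978-2'],
        '2021': [1.0, 2.0, 0.0],
        '2022': [0.0, 1.0, 4.0],
        '2023': [3.0, 0.0, 1.0],
        'Grand Total': [4.0, 3.0, 5.0],
    })
    # The two '978-1' rows collapse into one, with every year column summed
    return toy.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum().reset_index()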

def main():
    STEP_IS_EXIST = True
    if STEP_IS_EXIST:
        table = step1_merge_change()
    # Build the step-2 table only if its result file does not exist yet
    step_2_table_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx")
    if not os.path.exists(step_2_table_path):
        step2_table = bcr_20240724.step2_change(table, reduce_columns=REDUCE_COLUMNS, keep_columns=KEEP_COLUMNS,
                                                export=True)
    else:
        step2_table = pd.read_excel(step_2_table_path, sheet_name=0)
    # Build the step-3 table only if its result file does not exist yet
    no_data_table_path = os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx')
    if not os.path.exists(no_data_table_path):
        step3_table, no_data_table = bcr_20240201.step3(step2_table, export=True)
    else:
        no_data_table = pd.read_excel(no_data_table_path, sheet_name=0)
    # Run step 4
    bcr_20240201.step4(no_data_table)


if __name__ == '__main__':
    main()