# -*- coding: utf-8 -*-
# @Time : 2023/5/24 9:36
# @Author : zhaoxiangpeng
# @File : record_20230524.py

# 2023-05-24 BCR processing
# Requirement
#   Supplement the fields of the EID-deduplicated table via DOI and ISBN;
#   prefer DOI, fall back to ISBN; when duplicates occur, keep only one row.
# Implementation
#   1. Dedupe the big file (the EID-deduplicated table) by ISBN and EID; multi-valued
#      ISBN cells must be split apart before deduping, keeping only one row each.
#   2. Join on DOI first, join the unmatched remainder on ISBN, then concatenate
#      the two join results.
# Cached files are available on the Y drive.

import os
from typing import Tuple, Union

import openpyxl
import pandas as pd

import data_process_tool

BASE_PATH = 'F:/工作数据存储2023/20230517_一年一度的BCR/'
HEAD = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码',
        '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '通讯地址', '编者', '出版商',
        'ISSN', 'ISBN', 'CODEN', 'PubMed ID', '原始文献语言', '来源出版物名称缩写', '文献类型', '出版阶段',
        '开放获取', '来源出版物', 'EID', 'Scopus ID', 'Title', 'Author', 'Author ID', 'Sort Year',
        '2020', '2021', '2022', 'GrandTotal', '访问类型']


def slice_read(path) -> pd.DataFrame:
    # Open the Excel file in read_only mode to keep memory usage low
    workbook = openpyxl.load_workbook(filename=path, read_only=True)
    # Take the first worksheet and read its cells row by row
    worksheet = workbook.worksheets[0]
    data_array = []
    # iter_rows with values_only=True yields plain tuples of cell values
    for row in worksheet.iter_rows(max_col=40, values_only=True):
        data_array.append(row)
    # Close the workbook to release the read-only handle
    workbook.close()
    table = pd.DataFrame(data=data_array, columns=HEAD)
    return table


def chunk_read(path, chunksize=100000):
    """Read a large CSV lazily in chunks (left as a bare stub in the original; given a minimal working body here)."""
    return pd.read_csv(path, chunksize=chunksize)


def step0() -> pd.DataFrame:
    """Merge the compressed parts of 'eid去重保留最大grandTotal' into one file."""
    cache = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv')
    if os.path.exists(cache):
        return data_process_tool.read_data(cache)
    COMPRESS_PATH = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal-csv')
    compress_files = os.listdir(COMPRESS_PATH)
    big_join_file = pd.DataFrame()
    # Drop the last listed part: that slice is read from the .xlsx below instead
    compress_files.pop()
    for compress_file in compress_files:
        data = data_process_tool.read_data(
            os.path.join(COMPRESS_PATH, compress_file),
            # low_memory=False
        )
        big_join_file = pd.concat([big_join_file, data])
    data = slice_read(os.path.join(BASE_PATH, 'eid去重保留最大grandTotal/eid去重保留最大grandTotal-3.xlsx'))
    big_join_file = pd.concat([big_join_file, data])
    big_join_file.to_csv(cache, index=False)
    return big_join_file


def step1() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Process the big file."""
    EID_PROED: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'EID_processed.csv')
    if not os.path.exists(EID_PROED):
        EID_PROED_CACHE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv')
        if os.path.exists(EID_PROED_CACHE):
            EID_PROED_CACHE: pd.DataFrame = data_process_tool.read_data(EID_PROED_CACHE)
        else:
            EID_PROED_CACHE: pd.DataFrame = step0()
        # Split multi-valued ISBN cells into separate columns
        ISBNs = EID_PROED_CACHE['ISBN'].str.split('; ', expand=True)
        ISBNs = ISBNs.stack()  # stack the split columns into rows, one ISBN per row
        ISBNs = ISBNs.reset_index(level=1, drop=True)  # drop the extra index level added by stack
        ISBNs.name = 'ISBN'
        EID_PROED: pd.DataFrame = EID_PROED_CACHE.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs)
        # Rename the headers with an '-Other' postfix to avoid collisions when merging
        new_columns = data_process_tool.rename_head(EID_PROED, postfix='-Other')
        EID_PROED.rename(columns=new_columns, inplace=True)
        # Cache the processed table
        EID_PROED.to_csv(os.path.join(BASE_PATH, 'EID_processed.csv'), index=False)
    else:
        EID_PROED = data_process_tool.read_data(EID_PROED)
    # Dedupe by DOI, keeping one row, for the DOI match
    # (chained instead of the original inplace drop_duplicates on a dropna slice,
    #  which would trigger SettingWithCopyWarning)
    DOI_PROED = EID_PROED.dropna(subset=['DOI-Other']).drop_duplicates(subset=['DOI-Other'])
    # Dedupe by ISBN, keeping one row, for the ISBN match on the remainder
    ISBN_PROED = EID_PROED.dropna(subset=['ISBN-Other']).drop_duplicates(subset=['ISBN-Other'])
    return DOI_PROED, ISBN_PROED
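
# Illustrative sketch only (not called by the pipeline): shows on toy data how the
# split/stack/join pattern in step1 expands multi-valued ISBN cells into one row per
# ISBN. The frame and values below are invented for the example.
def _demo_isbn_explode() -> pd.DataFrame:
    df = pd.DataFrame({'EID': ['e1', 'e2'], 'ISBN': ['111; 222', '333']})
    isbns = df['ISBN'].str.split('; ', expand=True).stack()
    isbns = isbns.reset_index(level=1, drop=True)  # drop the column level added by stack
    isbns.name = 'ISBN'
    # Joining on the original index duplicates 'e1' into two rows, one per ISBN
    return df.rename(columns={'ISBN': 'ISBN RAW'}).join(isbns)
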
def step2():
    BASE_FILE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, '副本BCR2022总表-20220729.xlsx')
    BASE_FILE = data_process_tool.read_data(BASE_FILE)
    doi_table, isbn_table = step1()
    doi_ = pd.merge(BASE_FILE, doi_table, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    # Split the DOI join into matched and unmatched rows
    has_data = doi_[doi_['DOI-Other'].notnull()]  # matched
    no_data = doi_[doi_['DOI-Other'].isnull()]  # unmatched
    del doi_
    no_data = no_data[BASE_FILE.columns.values.tolist()]  # drop the empty merge columns from the unmatched rows
    # Match the rows that missed on DOI against ISBN instead
    isbn_ = pd.merge(no_data, isbn_table, how='left', left_on=['ISBN.1'], right_on=['ISBN-Other'])
    # Rows still unmatched here need no further handling: there are no matching keys left
    # Concatenate the DOI match result with the ISBN match result
    result_table = pd.concat([has_data, isbn_])
    result_table.to_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'), index=False)


def step3():
    """Export the match result to Excel."""
    df = pd.read_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'))
    df.to_excel(os.path.join(BASE_PATH, 'BCR匹配结果.xlsx'), index=False)


def main():
    # step0()
    step2()
    # step3()


if __name__ == '__main__':
    main()
    # slice_read()
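
# The 'eid去重保留最大grandTotal' cache consumed by step0 is produced upstream (it
# already exists on the Y drive); this script never recomputes it. For reference, a
# minimal sketch of how that dedup could be reproduced in pandas, assuming the 'EID'
# and 'GrandTotal' columns from HEAD: sort so the largest GrandTotal comes first,
# then keep the first row per EID.
def dedupe_keep_max_grand_total(df: pd.DataFrame) -> pd.DataFrame:
    return (df.sort_values('GrandTotal', ascending=False)
              .drop_duplicates(subset=['EID'], keep='first'))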