# -*- coding: utf-8 -*- # @Time : 2022/12/1 16:48 # @Author : ZAOXG # @File : api记录匹配.py import data_process_tool import pandas as pd def step1(): # 合并小文件 data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API采集-BCR2022相同记录\API采集', on_columns=['EID'], encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip') data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2', on_columns=['EID'], encoding_errors='ignore', on_bad_lines='skip') def step2(): record1 = data_process_tool.read_data(root_path+'2023记录.csv') record1.drop_duplicates(inplace=True) record1.rename(columns={'EID': '原始记录'}, inplace=True) record2 = data_process_tool.read_data(root_path+'API采集.csv') record2.drop_duplicates(inplace=True) record3 = pd.merge(record2, record1, how='left', left_on=['EID'], right_on=['原始记录']) print(record3) error_record = record3[record3['EID'].isna()] error_record.to_csv(root_path+'未匹配到记录2-EID.csv', index=False) # error_record.to_excel(root_path+'未匹配到记录-EID.xlsx', index=False) def step3(): record1 = data_process_tool.read_data(root_path + '2023原始记录.csv') record1.drop_duplicates(subset=['EID'], inplace=True) # 对每个表的列名做标记 record1_rename = {} for r1_name in record1.columns: record1_rename[r1_name] = r1_name + '(2023记录)' record1.rename(columns=record1_rename, inplace=True) record2 = data_process_tool.read_data(root_path + 'API下载记录.csv') record2.drop_duplicates(subset=['EID'], inplace=True) record2_rename = {} for r2_name in record2.columns: record2_rename[r2_name] = r2_name + '(API记录)' record2.rename(columns=record2_rename, inplace=True) # 左连接找出右表缺失字段 record3 = pd.merge(record2, record1, how='left', left_on=['EID(API记录)'], right_on=['EID(2023记录)']) print(record3) error_record = record3[record3['EID(2023记录)'].isna()] error_record.to_excel(root_path + 'API下载记录有2023原始记录无.xlsx', index=False) record4 = pd.merge(record1, record2, how='left', left_on=['EID(2023记录)'], right_on=['EID(API记录)']) print(record4) error_record2 = record4[record4['EID(API记录)'].isna()] error_record2.to_excel(root_path + '2023原始记录有API下载记录无.xlsx', index=False) if __name__ == '__main__': root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/' # step2() data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API采集', to_type='csv', encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip') # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录', encoding_errors='ignore', # on_bad_lines='skip') # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录\2022-11-21-下载记录', encoding_errors='ignore', # on_bad_lines='skip') # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API失败记录重新下载采集', # to_type='csv', # encoding='GB2312', encoding_errors='ignore', # on_bad_lines='skip') # step3()