You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

72 lines
3.5 KiB
Python

# -*- coding: utf-8 -*-
# @Time : 2022/12/1 16:48
# @Author : ZAOXG
# @File : api记录匹配.py
import data_process_tool
import pandas as pd
def step1():
# 合并小文件
data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API采集-BCR2022相同记录\API采集', on_columns=['EID'], encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip')
data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2', on_columns=['EID'], encoding_errors='ignore', on_bad_lines='skip')
def step2():
record1 = data_process_tool.read_data(root_path+'2023记录.csv')
record1.drop_duplicates(inplace=True)
record1.rename(columns={'EID': '原始记录'}, inplace=True)
record2 = data_process_tool.read_data(root_path+'API采集.csv')
record2.drop_duplicates(inplace=True)
record3 = pd.merge(record2, record1, how='left', left_on=['EID'], right_on=['原始记录'])
print(record3)
error_record = record3[record3['EID'].isna()]
error_record.to_csv(root_path+'未匹配到记录2-EID.csv', index=False)
# error_record.to_excel(root_path+'未匹配到记录-EID.xlsx', index=False)
def step3():
record1 = data_process_tool.read_data(root_path + '2023原始记录.csv')
record1.drop_duplicates(subset=['EID'], inplace=True)
# 对每个表的列名做标记
record1_rename = {}
for r1_name in record1.columns:
record1_rename[r1_name] = r1_name + '(2023记录)'
record1.rename(columns=record1_rename, inplace=True)
record2 = data_process_tool.read_data(root_path + 'API下载记录.csv')
record2.drop_duplicates(subset=['EID'], inplace=True)
record2_rename = {}
for r2_name in record2.columns:
record2_rename[r2_name] = r2_name + '(API记录)'
record2.rename(columns=record2_rename, inplace=True)
# 左连接找出右表缺失字段
record3 = pd.merge(record2, record1, how='left', left_on=['EID(API记录)'], right_on=['EID(2023记录)'])
print(record3)
error_record = record3[record3['EID(2023记录)'].isna()]
error_record.to_excel(root_path + 'API下载记录有2023原始记录无.xlsx', index=False)
record4 = pd.merge(record1, record2, how='left', left_on=['EID(2023记录)'], right_on=['EID(API记录)'])
print(record4)
error_record2 = record4[record4['EID(API记录)'].isna()]
error_record2.to_excel(root_path + '2023原始记录有API下载记录无.xlsx', index=False)
if __name__ == '__main__':
root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
# step2()
data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API采集', to_type='csv',
encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip')
# data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录', encoding_errors='ignore',
# on_bad_lines='skip')
# data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录\2022-11-21-下载记录', encoding_errors='ignore',
# on_bad_lines='skip')
# data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API失败记录重新下载采集',
# to_type='csv',
# encoding='GB2312', encoding_errors='ignore',
# on_bad_lines='skip')
# step3()