# -*- coding: utf-8 -*-
# @Time : 2024/3/11 13:48
# @Author : zhaoxiangpeng
# @File : BCR_20240311.py

import os

import pandas as pd
from loguru import logger

# When True, every row is tagged with the path of the file it was read from.
ADD_SOURCE = True
# Root directory that holds both the raw input folder and the output folder.
BASE_PATH = 'Y:\\BCR\\202407'


def load_all_small_file(path: str):
    """Yield the full path of every file one directory level below *path*.

    The expected layout is ``path/<sub-directory>/<file>``. Entries directly
    under *path* that are not directories are skipped (calling
    ``os.listdir`` on them would raise), and only regular files inside each
    sub-directory are yielded. Names are visited in sorted order so repeated
    runs process the files in the same sequence.

    Args:
        path: Directory whose immediate sub-directories contain the files.

    Yields:
        The joined path ``path/<sub-directory>/<file>`` of each file.
    """
    for dir_name in sorted(os.listdir(path)):
        sub_dir = os.path.join(path, dir_name)
        if not os.path.isdir(sub_dir):
            continue
        for file_name in sorted(os.listdir(sub_dir)):
            full_file_path = os.path.join(sub_dir, file_name)
            if os.path.isfile(full_file_path):
                yield full_file_path


def step0(base_path: str = BASE_PATH, rows_per_file: int = 1000000):
    """Merge all raw CSV files and re-export them in fixed-size chunks.

    Every file found by ``load_all_small_file`` under
    ``<base_path>/API分工原始采集记录`` is read with pandas (malformed lines
    are skipped). When ``ADD_SOURCE`` is true, a ``SOURCE`` column holding the
    file's full path is added. The combined table is then written to
    ``<base_path>/After`` as ``1.txt``/``1.xlsx``, ``2.txt``/``2.xlsx``, ...,
    each holding at most *rows_per_file* rows. Nothing is written when no
    rows were loaded.

    Args:
        base_path: Root directory containing the input folder; the output
            folder ``After`` is created inside it if missing.
        rows_per_file: Maximum number of data rows per output file. Must be
            positive and should stay below the Excel worksheet limit of
            1,048,576 rows (header included), otherwise ``to_excel`` fails.
    """
    tables = []
    for file_path in load_all_small_file(os.path.join(base_path, "API分工原始采集记录")):
        logger.debug('当前处理 %s' % file_path)
        table = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, index_col=False)
        if ADD_SOURCE:
            table['SOURCE'] = file_path
        logger.debug('表头: %s' % table.columns.values.tolist())
        tables.append(table)

    # Concatenate once: calling pd.concat inside the loop re-copies all of the
    # accumulated rows on every iteration. pd.concat rejects an empty list, so
    # fall back to an empty frame when no input file was found.
    big_table = pd.concat(tables) if tables else pd.DataFrame()

    save_path = os.path.join(base_path, "After")
    os.makedirs(save_path, exist_ok=True)

    row_count = big_table.shape[0]
    for file_idx, start in enumerate(range(0, row_count, rows_per_file), start=1):
        # Positional slice; the concatenated index is not unique and is not
        # written out (index=False).
        chunk = big_table.iloc[start:start + rows_per_file]
        chunk.to_csv(os.path.join(save_path, '%s.txt' % file_idx), sep='\t', index=False)
        chunk.to_excel(os.path.join(save_path, '%s.xlsx' % file_idx), index=False)


if __name__ == '__main__':
    step0()