# data_process/bcr/BCR_20240311.py

# -*- coding: utf-8 -*-
# @Time    : 2024/3/11 13:48
# @Author  : zhaoxiangpeng
# @File    : BCR_20240311.py

import os
import pandas as pd
from loguru import logger

# When true, step0 adds a 'SOURCE' column to every table it loads, recording
# which input file the rows were read from.
ADD_SOURCE = True
# Root working directory; step0 reads its input from, and writes its output
# to, sub-folders of this path.
BASE_PATH = 'Y:\\BCR\\202407'


def load_all_small_file(path: str):
    """Yield the full path of every file stored one directory level below *path*.

    The expected layout is ``path/<sub-directory>/<file>``. Entries directly
    inside *path* that are not directories, and entries inside a
    sub-directory that are not regular files, are skipped.

    Args:
        path: Directory whose immediate sub-directories hold the files.

    Yields:
        The joined ``path/<sub-directory>/<file>`` string for each file,
        sub-directories and files both visited in sorted name order so the
        sequence is reproducible between runs.

    Raises:
        OSError: If *path* itself cannot be listed (e.g. it does not exist).
    """
    for dir_name in sorted(os.listdir(path)):
        sub_dir = os.path.join(path, dir_name)
        if not os.path.isdir(sub_dir):
            # A stray file next to the sub-directories would make
            # os.listdir() raise NotADirectoryError below.
            continue
        for file_name in sorted(os.listdir(sub_dir)):
            full_file_path = os.path.join(sub_dir, file_name)
            # Only regular files are useful to callers; nested folders are
            # not descended into.
            if os.path.isfile(full_file_path):
                yield full_file_path


def step0():
    """Merge every collected CSV into one table and re-export it in chunks.

    Reads each file yielded by ``load_all_small_file`` for the
    ``API分工原始采集记录`` folder under ``BASE_PATH``, optionally tags its rows
    with the originating file path (``ADD_SOURCE``), concatenates everything,
    and writes the result to ``BASE_PATH/After`` as numbered pairs
    ``1.txt`` (tab-separated) / ``1.xlsx``, ``2.txt`` / ``2.xlsx``, ...
    holding at most 1,000,000 rows each.

    Nothing is written when no input file is found.
    """
    tables = []
    for file_path in load_all_small_file(os.path.join(BASE_PATH, "API分工原始采集记录")):
        logger.debug('当前处理 %s' % file_path)
        # on_bad_lines='skip' drops malformed rows instead of aborting the run.
        table = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, index_col=False)
        if ADD_SOURCE:
            table['SOURCE'] = file_path
        logger.debug('表头: %s' % table.columns.values.tolist())
        tables.append(table)

    if not tables:
        # pd.concat() raises ValueError on an empty list, and there is
        # nothing to export anyway.
        return

    # Concatenate once: growing the frame inside the loop would re-copy all
    # previously loaded rows on every iteration.
    big_table = pd.concat(tables)

    split = 1000000
    save_path = os.path.join(BASE_PATH, "After")
    # Make sure the output folder exists before the first write.
    os.makedirs(save_path, exist_ok=True)

    for file_idx, start in enumerate(range(0, len(big_table), split), start=1):
        # Positional slice: the concatenated index is not unique, so rows are
        # selected by position rather than by label.
        chunk = big_table.iloc[start:start + split]
        chunk.to_csv(os.path.join(save_path, '%s.txt' % file_idx), sep='\t', index=False)
        chunk.to_excel(os.path.join(save_path, '%s.xlsx' % file_idx), index=False)


# Script entry point: run step0 only when this file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    step0()