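# NOTE: the lines below are residue from the code-hosting web page this file
# was copied from; they are not part of the script.
#   You cannot select more than 25 topics
#   Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#   54 lines
#   1.6 KiB
#   Python
#   54 lines
#   1.6 KiB
#   Python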
# -*- coding: utf-8 -*-
|
|
# @Time : 2024/3/11 13:48
|
|
# @Author : zhaoxiangpeng
|
|
# @File : BCR_20240311.py
|
|
|
|
import os
|
|
import pandas as pd
|
|
from loguru import logger
|
|
|
|
# When true, step0 adds a 'SOURCE' column holding the path of the file each
# row was read from.
ADD_SOURCE: bool = True

# Root directory of this batch; step0 reads its input from, and writes its
# output to, sub-directories of this path.
BASE_PATH: str = 'Y:\\BCR\\202407'
|
|
|
|
|
|
def load_all_small_file(path: str):
    """Yield the path of every entry found one level below ``path``.

    ``path`` is expected to contain sub-directories. Each entry inside each
    sub-directory is yielded as ``os.path.join(path, sub_dir, entry)``.

    Entries directly under ``path`` that are not directories are skipped;
    listing them would raise ``NotADirectoryError``. Both directory listings
    are sorted so the yielded order is deterministic (``os.listdir`` returns
    entries in arbitrary order).

    Args:
        path: Root directory whose sub-directories are scanned.

    Yields:
        str: Full path of each entry inside each sub-directory of ``path``.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``path`` does not exist). Raised on first iteration, since
            this is a generator.
    """
    for dir_name in sorted(os.listdir(path)):
        sub_dir = os.path.join(path, dir_name)
        if not os.path.isdir(sub_dir):
            # A stray file next to the sub-directories: nothing to list.
            continue
        for file_name in sorted(os.listdir(sub_dir)):
            yield os.path.join(sub_dir, file_name)
|
|
|
|
|
|
def step0():
    """Merge all collected CSV files into one table and write it out in chunks.

    Steps:
      1. Read every file yielded by ``load_all_small_file`` for the
         ``API分工原始采集记录`` directory under ``BASE_PATH`` with
         ``pandas.read_csv`` (malformed lines are skipped).
      2. When ``ADD_SOURCE`` is true, add a ``SOURCE`` column holding the full
         path of the file each row came from.
      3. Concatenate all tables.
      4. Write the result to ``BASE_PATH/After`` in slices of at most
         1,000,000 rows, as ``<n>.txt`` (tab-separated) and ``<n>.xlsx``,
         with ``n`` starting at 1.

    Nothing is written when no rows were read.
    """
    source_root = os.path.join(BASE_PATH, "API分工原始采集记录")

    tables = []
    for file_path in load_all_small_file(source_root):
        logger.debug('当前处理 %s' % file_path)
        table = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, index_col=False)
        if ADD_SOURCE:
            table['SOURCE'] = file_path
        logger.debug('表头: %s' % table.columns.values.tolist())
        tables.append(table)

    # Concatenate once: calling pd.concat inside the loop re-copies the whole
    # accumulated table for every input file. pd.concat rejects an empty
    # list, so fall back to an empty frame when there were no input files.
    big_table = pd.concat(tables) if tables else pd.DataFrame()

    # Rows per output file. NOTE(review): presumably chosen to stay below the
    # xlsx per-sheet row limit -- confirm before raising it.
    split = 1000000

    save_path = os.path.join(BASE_PATH, "After")
    # Ensure the output directory exists; to_csv/to_excel do not create it.
    os.makedirs(save_path, exist_ok=True)

    row_count = big_table.shape[0]
    for file_idx, start in enumerate(range(0, row_count, split), start=1):
        # Positional slice: the concatenated index contains duplicates, so
        # rows are selected by position, not by label.
        chunk = big_table.iloc[start: start + split]
        chunk.to_csv(os.path.join(save_path, '%s.txt' % file_idx), sep='\t', index=False)
        chunk.to_excel(os.path.join(save_path, '%s.xlsx' % file_idx), index=False)
|
|
|
|
|
|
# Script entry point: run the merge-and-split step only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    step0()
|
|
|
|
|