# -*- coding: utf-8 -*-
# @Time : 2024/2/1 17:10
# @Author : zhaoxiangpeng
# @File : 合并小文件.py  (merge small files)
import os
import re
import warnings

import chardet
import pandas as pd
from loguru import logger


def read_standard(filename):
    """Read a tab-separated file, skipping bad lines and ignoring encoding errors."""
    table = pd.read_table(filename, encoding_errors='ignore', on_bad_lines='skip', low_memory=False)
    return table


def merge_files(path):
    """Concatenate every file under *path* into one tab-separated master CSV."""
    big_table = pd.DataFrame()
    files = os.listdir(path)
    for file in files:
        file_path = os.path.join(path, file)
        table = read_standard(file_path)
        big_table = pd.concat([big_table, table])
    big_table.to_csv(os.path.join("Y:\\zhaoxiangpeng\\2024BCR", '2024BCR总表.csv'), sep='\t', index=False)


def read_file(file_path, encoding: str = 'gbk', error: bool = False):
    """Open a text file; when *error* is True, detect the real encoding with chardet first."""
    if not error:
        f = open(file_path, encoding=encoding)
    else:
        warnings.warn('%s encoding problem, running detection' % file_path)
        with open(file_path, 'rb') as check:
            data = check.read()
        info = chardet.detect(data)
        encoding = info['encoding']
        warnings.warn('%s trying to decode with "%s"' % (file_path, encoding))
        f = open(file_path, encoding=encoding)
    return f, encoding


def merge_files_by_row(path):
    """
    Read the small files line by line and repackage them into standard
    shards of 100,000 records each.
    """
    ERROR_FILE = ['ALL-CM.CSV', 'ALL-HDX.CSV', '失败记录第二次重采-20231228 15时53分下载.csv']
    data_column_count = 35
    files = os.listdir(path)
    decode_table = dict()
    split_str = '\t'
    documents = []
    document_count = 0
    file_seq = 1
    for file in files:
        if file in ERROR_FILE:
            split_str = ','
            logger.warning("File may have been modified, skipping %s" % file)
            continue
        else:
            split_str = '\t'
        file_path = os.path.join(path, file)
        logger.info('Processing %s' % file_path)
        f, code = read_file(file_path)
        try:
            h = f.readline()
            head = h.strip('\n').split(split_str)
            logger.debug("Header length: %s, %s" % (len(head), head))
        except UnicodeDecodeError:
            # The default encoding failed on the header; re-open with a detected encoding.
            f, code = read_file(file_path, error=True)
            h = f.readline()
            head = h.strip('\n').split(split_str)
            logger.debug("Header length: %s, %s" % (len(head), head))
        decode_table[file] = code  # remember which encoding each file was read with
        if '' in head:
            data_column_count = head.index('')
        if len(head) > data_column_count:
            head = head[:data_column_count]
        # print(head)
        line = None
        while True:
            try:
                line = f.readline()
            except UnicodeDecodeError:
                logger.info('Decode error; last readable line: %s' % line)
                continue
            if not line:
                break
            data = line.strip('\n').split(split_str)
            documents.append(
                dict(zip(head, data[:data_column_count]))
            )
            document_count += 1
            if document_count >= 1e5:
                shard = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\After", '%s.csv' % file_seq)
                logger.info("Record count reached %s, saving a shard: %s" % (document_count, shard))
                big_table = pd.DataFrame(documents)
                logger.info("Shape: %s %s" % big_table.shape)
                big_table.to_csv(shard, sep='\t', index=False)
                file_seq += 1
                documents = []
                document_count = 0
        f.close()
    # Flush whatever is left after the last file as the final shard.
    shard = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\After", '%s.csv' % file_seq)
    logger.info("Saving the final shard (%s records): %s" % (document_count, shard))
    big_table = pd.DataFrame(documents)
    big_table.to_csv(shard, sep='\t', index=False)
    logger.info("File encoding table: %s" % decode_table)


def merge_error_file(path, files: list):
    """Merge the listed problem files, then re-split the result into 100,000-row shards."""
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path, file)
        small_table = pd.read_csv(file_full_path, low_memory=False, encoding_errors='ignore', on_bad_lines='skip')
        print(small_table.shape)
        big_table = pd.concat([big_table, small_table])
    start = 0
    split = 100000
    row, col = big_table.shape
    file_idx = 101  # start numbering at 101 so earlier shards are not overwritten
    for x in range(start, row, split):
        table = big_table[x: x + split]
        table.to_csv(os.path.join('Y:\\zhaoxiangpeng\\2024BCR\\After', '%s.csv' % file_idx), index=False, sep='\t')
        file_idx += 1


def merge_standard_file(path):
    files = os.listdir(path)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path, file)
        small_table = pd.read_csv(file_full_path, sep='\t', low_memory=False)
        big_table = pd.concat([big_table, small_table])
    row, col = big_table.shape
    split = 1000000
    file_idx = 1
    for x in range(0, row, split):
        table = big_table[x: x + split]
        table.to_csv(os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\MergeFile", '%s.csv' % file_idx), index=False, sep='\t')
        file_idx += 1


def merge_small_file(path):
    """Merge small CSV files and export them as 800,000-row Excel workbooks."""
    files = os.listdir(path)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path, file)
        small_table = pd.read_csv(file_full_path, index_col=False, low_memory=False, encoding_errors='ignore', on_bad_lines='skip')
        big_table = pd.concat([big_table, small_table])
    row, col = big_table.shape
    split = 800000
    file_idx = 1
    for x in range(0, row, split):
        table = big_table[x: x + split]
        table.to_excel(os.path.join(r"Y:\BCR\BCR202412\BCR2024书目补采API", '%s.xlsx' % file_idx), index=False)
        file_idx += 1


def find_eid_by_regex(text):
    res = re.search(r'2-s2\.0-\d+', text)
    if res:
        return res.group(0)
    return None


def batch_match(path):
    """Scan every file under *path* line by line and write the EIDs that match to eid.csv."""
    count = 0
    line_count = 0
    writer = open('Y:\\zhaoxiangpeng\\BCR\\2025BCR\\eid.csv', 'a+', encoding='utf-8')
    writer.write('EID' + '\n')
    file_list = os.listdir(path)
    for fname in file_list:
        file = os.path.join(path, fname)
        with open(file, encoding='utf-8') as f:
            while line := f.readline():
                line_count += 1
                eid = find_eid_by_regex(line)
                if not eid:
                    print(line)
                else:
                    count += 1
                    writer.write(eid + '\n')
    writer.close()
    print('Total lines: %s\nMatched: %s' % (line_count, count))


def func11():
    """List the EIDs in eid.csv that are missing from the merged Excel files."""
    path = 'Y:\\zhaoxiangpeng\\BCR\\2025BCR'
    path2 = os.path.join(path, 'MergeFile')
    files = os.listdir(path2)
    big_table = pd.DataFrame()
    for file in files:
        file_full_path = os.path.join(path2, file)
        small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
        small_table = small_table[['EID']]
        print(small_table.shape)
        big_table = pd.concat([big_table, small_table])
    big_table.drop_duplicates(subset=['EID'], inplace=True)
    t2 = pd.read_csv(os.path.join(path, 'eid.csv'))
    t2.drop_duplicates(subset=['EID'], inplace=True)
    t2.rename(columns={'EID': "EID2"}, inplace=True)
    # Left join: rows whose EID is NaN exist only in eid.csv and still need to be collected.
    t0 = pd.merge(t2, big_table, how='left', left_on=['EID2'], right_on=['EID'])
    print(t0)
    t0[t0['EID'].isna()]['EID2'].to_csv(os.path.join(path, 'eid2.csv'), index=False)


if __name__ == '__main__':
    # merge_files_by_row("Y:\\zhaoxiangpeng\\2024BCR\\API采集数据")
    # merge_error_file("Y:\\zhaoxiangpeng\\2024BCR\\API采集数据",
    #                  files=['ALL-CM.CSV', 'ALL-HDX.CSV', '失败记录第二次重采-20231228 15时53分下载.csv'])
    # merge_standard_file('Y:\\zhaoxiangpeng\\2024BCR\\After')
    merge_small_file(r'Y:\BCR\BCR202412\BCR2024书目补采API')
    # batch_match('Y:\\zhaoxiangpeng\\BCR\\2025BCR\\API采集')
    # func11()