data_process/data_process_tool/file_read.py

# -*- coding: utf-8 -*-
# @Time    : 2022/8/5 8:58
# @Author  : ZAOXG
# @File    : file_read.py

import os
import warnings

import chardet
import pandas as pd

# Names exported by `from <this module> import *`; only read_data is listed.
__all__ = [
    'read_data'
]

# Dispatch table: lower-cased file extension -> pandas reader used by read_data.
# NOTE: 'xls2' maps to pd.read_html, which returns a list of DataFrames rather
# than a single DataFrame.
file_type_operation = {
    'csv': pd.read_csv,
    'xlsx': pd.read_excel,
    'xls': pd.read_excel,
    'txt': pd.read_table,
    'xls2': pd.read_html
}


def read_data(file: str, **kwargs) -> pd.DataFrame:
    """Load *file* with the pandas reader selected by its extension.

    The extension is taken from the final path component only and matched
    case-insensitively against ``file_type_operation``. A file name without
    any dot is read as ``txt`` (``pd.read_table``).

    If the reader raises ``UnicodeDecodeError``, the raw bytes are passed to
    ``chardet.detect`` and the read is retried exactly once with the detected
    encoding supplied as the ``encoding`` keyword argument.

    Args:
        file: Path of the file to read.
        **kwargs: Forwarded unchanged to the selected pandas reader.

    Returns:
        Whatever the selected reader returns (a DataFrame for every entry
        except ``'xls2'``, see the note on ``file_type_operation``).

    Raises:
        KeyError: The extension has no entry in ``file_type_operation``.
        UnicodeDecodeError: No encoding could be detected, or decoding with
            the detected encoding failed as well.
    """
    # Look only at the last path component so that a dot in a directory name
    # (e.g. "./v1.0/table") is not mistaken for an extension separator.
    base_name = os.path.basename(file)
    if '.' in base_name:
        file_type = base_name.rsplit('.', 1)[-1]
    else:
        file_type = 'txt'
    reader = file_type_operation[file_type.lower()]

    try:
        return reader(file, **kwargs)
    except UnicodeDecodeError:
        warnings.warn('%s 编码异常，启用检查' % file)
        with open(file, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        if encoding is None:
            # Nothing better to retry with; retrying would fail identically.
            raise
        warnings.warn('%s 尝试使用 "%s" 解码' % (file, encoding))
        kwargs.update(encoding=encoding)
        # Single retry: a second UnicodeDecodeError propagates to the caller
        # instead of recursing until RecursionError.
        return reader(file, **kwargs)