# -*- coding: utf-8 -*-
# @Time : 2023/5/24 9:36
# @Author : zhaoxiangpeng
# @File : record_20230524.py
# 2023-05-24 BCR processing
# Requirement:
# Fill in the fields of the EID dedup table by DOI and ISBN; prefer DOI, then ISBN, and keep only one row when duplicates appear.
# Approach:
# 1. Dedupe the big file (the EID dedup table) by ISBN and EID; an ISBN cell can hold several values, so split it first and keep only one row per ISBN.
# 2. Join on DOI first, join the rows that did not match on ISBN, then concatenate the two join results (toy sketches of both steps follow step1 and step2 below).
# Cached copies of the files are available on drive Y
import os
import openpyxl
from typing import Union, Tuple
import pandas as pd
import data_process_tool
BASE_PATH = 'F:/工作数据存储2023/20230517_一年一度的BCR/'
HEAD = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构',
        '带归属机构的作者', '通讯地址', '编者', '出版商', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', '原始文献语言', '来源出版物名称缩写', '文献类型', '出版阶段',
        '开放获取', '来源出版物', 'EID', 'Scopus ID', 'Title', 'Author', 'Author ID', 'Sort Year', '2020', '2021', '2022',
        'GrandTotal', '访问类型']


def slice_read(path) -> pd.DataFrame:
    # Open the Excel file in read_only mode
    workbook = openpyxl.load_workbook(filename=path, read_only=True)
    # Get the first worksheet
    worksheet = workbook.worksheets[0]
    dataArray = []
    # Read the cell values row by row with worksheet.iter_rows()
    for row in worksheet.iter_rows(max_col=40, values_only=True):
        dataArray.append(row)
    # Close the workbook
    workbook.close()
    table = pd.DataFrame(data=dataArray, columns=HEAD)
    return table


def chunk_read(path, chunksize=100000):
    # Minimal chunked CSV reader; the chunksize default here is illustrative.
    return pd.concat(pd.read_csv(path, chunksize=chunksize), ignore_index=True)
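# Hypothetical usage of chunk_read, assuming the cached dedup CSV written by step0
# already exists at this path (illustration only, not called anywhere in this script):
# eid_table = chunk_read(os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv'))

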
def step0() -> pd.DataFrame:
    """Merge the compressed 'eid去重保留最大grandTotal' files (EID dedup, keep the max GrandTotal) into one cached CSV."""
    cache = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv')
    if os.path.exists(cache):
        return data_process_tool.read_data(cache)
    COMPRESS_PATH = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal-csv')
    compress_files = os.listdir(COMPRESS_PATH)
    big_join_file = pd.DataFrame()
    # Skip the last entry of the directory listing
    compress_files.pop()
    for compress_file in compress_files:
        data = data_process_tool.read_data(
            os.path.join(COMPRESS_PATH, compress_file),
            # low_memory=False
        )
        big_join_file = pd.concat([big_join_file, data])
        # big_join_file.append(
        #     data_process_tool.read_data(
        #         os.path.join(COMPRESS_PATH, compress_file)
        #     )
        # )
    data = slice_read(os.path.join(BASE_PATH, 'eid去重保留最大grandTotal/eid去重保留最大grandTotal-3.xlsx'))
    big_join_file = pd.concat([big_join_file, data])
    big_join_file.to_csv(cache, index=False)
    return big_join_file


def step1() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Preprocess the big file (the EID dedup table)."""
    EID_PROED: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'EID_processed.csv')
    if not os.path.exists(EID_PROED):
        EID_PROED_CACHE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv')
        if os.path.exists(EID_PROED_CACHE):
            EID_PROED_CACHE: pd.DataFrame = data_process_tool.read_data(EID_PROED_CACHE)
        else:
            EID_PROED_CACHE: pd.DataFrame = step0()
        # Split multi-valued ISBN cells so there is one ISBN per row
        ISBNs = EID_PROED_CACHE['ISBN'].str.split('; ', expand=True)
        ISBNs = ISBNs.stack()  # stack the split columns into rows
        ISBNs = ISBNs.reset_index(level=1, drop=True)  # drop the extra index level
        ISBNs.name = 'ISBN'
        EID_PROED: pd.DataFrame = EID_PROED_CACHE.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs)
        # Rename the headers with a suffix
        new_columns = data_process_tool.rename_head(EID_PROED, postfix='-Other')
        EID_PROED.rename(columns=new_columns, inplace=True)
        # Cache the processed table
        EID_PROED.to_csv(os.path.join(BASE_PATH, 'EID_processed.csv'), index=False)
    else:
        EID_PROED = data_process_tool.read_data(EID_PROED)
    # Dedupe by DOI, keeping one row each, used for the DOI join
    DOI_PROED = EID_PROED.dropna(subset=['DOI-Other']).drop_duplicates(subset=['DOI-Other'])
    # Dedupe by ISBN, keeping one row each, used for the ISBN join of the remaining rows
    ISBN_PROED = EID_PROED.dropna(subset=['ISBN-Other']).drop_duplicates(subset=['ISBN-Other'])
    return DOI_PROED, ISBN_PROED


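# A toy, self-contained sketch of the ISBN split-and-stack step used in step1,
# assuming '; '-separated ISBN cells like the real data. The sample values are
# made up for illustration and this helper is never called above.
def _isbn_explode_sketch() -> pd.DataFrame:
    sample = pd.DataFrame({
        'EID': ['e1', 'e2'],
        'ISBN': ['978-0-00-000000-1; 978-0-00-000000-2', '978-0-00-000000-3'],
    })
    isbns = sample['ISBN'].str.split('; ', expand=True).stack()
    isbns = isbns.reset_index(level=1, drop=True)  # one ISBN per row, aligned to the original index
    isbns.name = 'ISBN'
    # Join back, then keep one row per ISBN, mirroring the dedup in step1
    exploded = sample.rename(columns={'ISBN': 'ISBN RAW'}).join(isbns)
    return exploded.drop_duplicates(subset=['ISBN'])

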
def step2():
    BASE_FILE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, '副本BCR2022总表-20220729.xlsx')
    BASE_FILE = data_process_tool.read_data(BASE_FILE)
    doi_table, isbn_table = step1()
    doi_ = pd.merge(BASE_FILE, doi_table, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    # Split the DOI join result into matched and unmatched rows
    has_data = doi_[doi_['DOI-Other'].notnull()]  # matched by DOI
    no_data = doi_[doi_['DOI-Other'].isnull()]  # not matched by DOI
    del doi_
    no_data = no_data[BASE_FILE.columns.values.tolist()]  # drop the extra columns from the unmatched rows
    # Match the rows that had no DOI hit by ISBN instead; anything still unmatched stays as-is, there is no further key to fall back on
    isbn_ = pd.merge(no_data, isbn_table, how='left', left_on=['ISBN.1'], right_on=['ISBN-Other'])
    # Combine the DOI-matched and ISBN-matched results
    result_table = pd.concat([has_data, isbn_])
    result_table.to_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'), index=False)


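# A minimal, self-contained sketch of the matching order implemented in step2:
# left-join on DOI first, retry the unmatched rows on ISBN, then concatenate.
# The frames and values below are made up for illustration; the real run uses
# the BCR master table and the two tables returned by step1. Never called above.
def _match_order_sketch() -> pd.DataFrame:
    base = pd.DataFrame({'DOI': ['10.1/a', None], 'ISBN': ['111', '222']})
    doi_ref = pd.DataFrame({'DOI-Other': ['10.1/a'], 'GrandTotal': [5]})
    isbn_ref = pd.DataFrame({'ISBN-Other': ['222'], 'GrandTotal': [7]})
    by_doi = pd.merge(base, doi_ref, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    matched = by_doi[by_doi['DOI-Other'].notnull()]
    # Keep only the original columns of the unmatched rows, then retry on ISBN
    missed = by_doi[by_doi['DOI-Other'].isnull()][base.columns.tolist()]
    by_isbn = pd.merge(missed, isbn_ref, how='left', left_on=['ISBN'], right_on=['ISBN-Other'])
    return pd.concat([matched, by_isbn])

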
def step3():
    """to_excel"""
    df = pd.read_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'))
    df.to_excel(os.path.join(BASE_PATH, 'BCR匹配结果.xlsx'), index=False)


def main():
    # step0()
    step2()
    # step3()


if __name__ == '__main__':
    main()
    # slice_read()