# -*- coding: utf-8 -*-
# @Time : 2023/5/24 9:36
# @Author : zhaoxiangpeng
# @File : record_20230524.py

# 2023-05-24 BCR processing
# Requirement
#   Supplement fields from the EID de-duplication table by matching on DOI and ISBN;
#   DOI takes priority, then ISBN; when duplicates appear, keep only one record.
# Implementation
#   1. First de-duplicate the big file (the EID de-duplication table) by ISBN and EID; a cell may
#      contain several ISBNs, so split them first and then de-duplicate, keeping only one row each.
#   2. Join on DOI first, then join the unmatched rows on ISBN, and finally concatenate the two
#      join results.

# The cached intermediate files are available on the Y drive.

import os
from typing import Union, Tuple

import openpyxl
import pandas as pd

import data_process_tool

BASE_PATH = 'F:/工作数据存储2023/20230517_一年一度的BCR/'
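# 40-column header of the source export; slice_read() uses it to rebuild each .xlsx part as a DataFrame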
HEAD = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构',
        '带归属机构的作者', '通讯地址', '编者', '出版商', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', '原始文献语言', '来源出版物名称缩写', '文献类型', '出版阶段',
        '开放获取', '来源出版物', 'EID', 'Scopus ID', 'Title', 'Author', 'Author ID', 'Sort Year', '2020', '2021', '2022',
        'GrandTotal', '访问类型']


def slice_read(path) -> pd.DataFrame:
    # Open the Excel file in read_only mode to keep memory usage low
    workbook = openpyxl.load_workbook(filename=path, read_only=True)

    # Read the cell values with worksheet.iter_rows()
    # Take the first worksheet
    worksheet = workbook.worksheets[0]
    dataArray = []
    # Iterate over the rows and collect the cell values (first 40 columns only)
    for row in worksheet.iter_rows(max_col=40, values_only=True):
        dataArray.append(row)
    # Close the workbook
    workbook.close()

    table = pd.DataFrame(data=dataArray, columns=HEAD)
    return table


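# Example usage (a sketch; this is the same .xlsx part that step0() reads):
# part3 = slice_read(os.path.join(BASE_PATH, 'eid去重保留最大grandTotal/eid去重保留最大grandTotal-3.xlsx'))

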
def chunk_read(path, chunksize: int = 100000):
    """Unused sketch of a chunked csv reader; the default chunk size is an assumed value."""
    return pd.read_csv(path, chunksize=chunksize)


def step0() -> pd.DataFrame:
    """Merge the compressed parts of "eid去重保留最大grandTotal" into one file."""
    cache = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv')
    if os.path.exists(cache):
        return data_process_tool.read_data(cache)
    COMPRESS_PATH = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal-csv')
    compress_files = os.listdir(COMPRESS_PATH)
    big_join_file = pd.DataFrame()
    # Drop the last file in the listing (presumably superseded by the .xlsx part read below)
    compress_files.pop()

    for compress_file in compress_files:
        data = data_process_tool.read_data(
            os.path.join(COMPRESS_PATH, compress_file),
            # low_memory=False
        )
        big_join_file = pd.concat([big_join_file, data])
        # big_join_file.append(
        #     data_process_tool.read_data(
        #         os.path.join(COMPRESS_PATH, compress_file)
        #     )
        # )
    data = slice_read(os.path.join(BASE_PATH, 'eid去重保留最大grandTotal/eid去重保留最大grandTotal-3.xlsx'))
    big_join_file = pd.concat([big_join_file, data])
    big_join_file.to_csv(cache, index=False)
    return big_join_file


def step1() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Process the big file (the EID de-duplication table)."""
    EID_PROED: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'EID_processed.csv')
    if not os.path.exists(EID_PROED):
        EID_PROED_CACHE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv')
        if os.path.exists(EID_PROED_CACHE):
            EID_PROED_CACHE: pd.DataFrame = data_process_tool.read_data(EID_PROED_CACHE)
        else:
            EID_PROED_CACHE: pd.DataFrame = step0()
        ISBNs = EID_PROED_CACHE['ISBN'].str.split('; ', expand=True)
        ISBNs = ISBNs.stack()  # stack the split columns into rows, one ISBN per row
        ISBNs = ISBNs.reset_index(level=1, drop=True)  # drop the extra index level so it aligns with the original rows
        ISBNs.name = 'ISBN'
        EID_PROED: pd.DataFrame = EID_PROED_CACHE.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs)
        # Rename the headers (append the '-Other' postfix)
        new_columns = data_process_tool.rename_head(EID_PROED, postfix='-Other')
        EID_PROED.rename(columns=new_columns, inplace=True)
        # Cache the processed table
        EID_PROED.to_csv(os.path.join(BASE_PATH, 'EID_processed.csv'), index=False)
    else:
        EID_PROED = data_process_tool.read_data(EID_PROED)
    # De-duplicate by DOI, keeping only one row each, for the DOI match
    DOI_PROED = EID_PROED.dropna(subset=['DOI-Other']).drop_duplicates(subset=['DOI-Other'])

    # De-duplicate by ISBN, keeping only one row each, for the ISBN match on the remaining rows
    ISBN_PROED = EID_PROED.dropna(subset=['ISBN-Other']).drop_duplicates(subset=['ISBN-Other'])

    return DOI_PROED, ISBN_PROED


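# A minimal, self-contained sketch (not called anywhere) of the split / stack / join idiom that
# step1 uses to expand multi-valued ISBN cells into one row per ISBN; the toy data is illustrative.
def _example_isbn_explode() -> pd.DataFrame:
    df = pd.DataFrame({'EID': ['e1', 'e2'], 'ISBN': ['111; 222', '333']})
    isbns = df['ISBN'].str.split('; ', expand=True)  # one split value per column
    isbns = isbns.stack()                            # stack the columns into rows (long format)
    isbns = isbns.reset_index(level=1, drop=True)    # keep only the original row index
    isbns.name = 'ISBN'
    # Joining on the index repeats 'e1' for each of its ISBNs -> (e1, 111), (e1, 222), (e2, 333)
    return df.rename(columns={'ISBN': 'ISBN RAW'}).join(isbns)

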
def step2():
    BASE_FILE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, '副本BCR2022总表-20220729.xlsx')
    BASE_FILE = data_process_tool.read_data(BASE_FILE)
    doi_table, isbn_table = step1()
    doi_ = pd.merge(BASE_FILE, doi_table, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    # Split the DOI join into matched and unmatched rows
    has_data = doi_[doi_['DOI-Other'].notnull()]  # matched rows
    no_data = doi_[doi_['DOI-Other'].isnull()]  # unmatched rows
    del doi_
    no_data = no_data[BASE_FILE.columns.values.tolist()]  # drop the extra columns from the unmatched rows
    # Match the rows that missed on DOI by ISBN instead
    isbn_ = pd.merge(no_data, isbn_table, how='left', left_on=['ISBN.1'], right_on=['ISBN-Other'])  # rows that still miss need no further handling: there is no other key to match on
    # Concatenate the DOI match result and the ISBN match result
    result_table = pd.concat([has_data, isbn_])
    result_table.to_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'), index=False)


def step3():
    """to_excel"""
    df = pd.read_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'))
    df.to_excel(os.path.join(BASE_PATH, 'BCR匹配结果.xlsx'), index=False)


def main():
    # step0()
    step2()
    # step3()


if __name__ == '__main__':
    main()
    # slice_read()