代码上传到git

5 months ago · f245a2c520
parent d75d888b43
commit f245a2c520
35 changed files with 2898 additions and 0 deletions
--- a/article_subject/init.py
+++ b/article_subject/init.py
@ -0,0 +1,4 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/6/21 8:50
 # @Author  : ZhaoXiangPeng
 # @File    : __init__.py
--- a/article_subject/extract_score.py
+++ b/article_subject/extract_score.py
@ -0,0 +1,82 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2023/5/4 16:33
 # @Author  : zhaoxiangpeng
 # @File    : extract_score.py
 # 2023年6月6日09:52:59修改首先要对 进行聚合，分为多个表，计算分数
 import os
 import pandas as pd
 from typing import Union
 import data_process_tool
 from article_subject.utils import get_row_top_join_sub, get_row_top
 # Alias for the row-scoring helper that BaseExtractScore.process applies to every row.
 get_subject = get_row_top_join_sub
 class GroupScore:
    """Score extractor meant to work on grouped rows.

    NOTE(review): this class looks unfinished -- ``by_column`` is accepted but
    never stored, and ``get_score`` groups by a column named '' and returns
    nothing. Confirm the intended behaviour before relying on it.
    """
    def __init__(self, by_column, base_columns: list = None):
        # Only the base columns are kept; ``by_column`` is currently ignored.
        self.columns = base_columns
    def get_score(self, table_or_file: Union[pd.DataFrame, str]) -> pd.DataFrame:
        """Load the table when a path is given, then group it.

        :param table_or_file: a DataFrame, or a path passed to ``data_process_tool.read_data``.
        """
        if isinstance(table_or_file, str):
            table_or_file = data_process_tool.read_data(table_or_file)
        assert isinstance(table_or_file, pd.DataFrame)
        # NOTE(review): the grouping key is an empty string and the result is
        # discarded; no value is returned despite the annotation.
        groups = table_or_file.groupby(by=[''])
 class BaseExtractScore:
    """Add a '高分学科' column derived from the score columns of a table.

    Every column that is not listed in ``base_columns`` is treated as a score
    column: it feeds the row scorer and is removed from the returned table.
    """
    def __init__(self, table: Union[pd.DataFrame, str], base_columns: list = None, score_columns: list = None):
        # ``score_columns`` is accepted but not used.
        self._table = table
        self._columns = base_columns
    def process(self) -> pd.DataFrame:
        """Compute the high-score subject for each row.

        A string table is first loaded through ``data_process_tool.read_data``.
        The table held by the instance is modified in place and returned.
        """
        if isinstance(self._table, str):
            self._table = data_process_tool.read_data(self._table)
        frame = self._table
        base = self._columns
        # Columns outside the base set are the score columns; remember them
        # now so they can be dropped once the summary column exists.
        score_columns = list(set(frame.columns) - set(base))
        # With the base columns moved into the index, each row handed to
        # get_subject holds score values only.
        frame.set_index(keys=base, inplace=True)
        frame['高分学科'] = frame.apply(get_subject, axis=1)
        frame.reset_index(inplace=True)
        frame.drop(labels=score_columns, axis=1, inplace=True)
        return frame
 def task1():
    """Run BaseExtractScore over the hard-coded 知识视界 CSV and save the result with a '-高分学科' suffix."""
    PATH = 'F:/工作数据存储2023/20230426_评分模型/'
    base_columns = ['Title', 'ABS', 'title_en', 'Title_ABS', 'cncole', '教育部学科门类', '教育部一级学科', '教育部二级学科', '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确', 'Unnamed: 13']
    source = os.path.join(PATH, '知识视界分类数据（已人工判断）_评分模型.csv')
    scored = BaseExtractScore(table=source, base_columns=base_columns).process()
    target = os.path.join(PATH, '知识视界分类数据（已人工判断）_评分模型-高分学科.csv')
    scored.to_csv(target, index=False)
 def task2():
    """Run BaseExtractScore over the hard-coded jove CSV and save the result with a '-高分学科' suffix."""
    PATH = 'F:/工作数据存储2023/20230426_评分模型/'
    base_columns = ['Genre - 655 indicatior 1 and 2 " 4"', 'Subjects - 650 indicator 1 and 2 " 4"', 'Additional Material Characteristics 006 “m     o  c        “', 'Physical Description Fixed Field - 007 “cr unu”', 'Video ID 001', 'Video Name 245 $a indicator 1 and 2 "00"', '856 $3 inidcatior 1 and 2 "40"', 'Link-856 $u', 'Physical Decription-300 $a', 'Format-', 'Runnng Time-', 'Subtitle/caption language codes (as ISO 3-letter codes separated by semicolons)', 'Chapter Number-', 'Formatted Contents: Chapter-505 $a', 'Content Type 336 “$btdi $2rdacontent”', 'Material Type 337 "$bc $2rdamedia”', 'Carrier Type - 338  “$bcr$2rdacarrier"', 'Summary - 520 $a', 'Source of Description Note-588 $a', 'Date-260 $c', 'Publisher-260 $b', 'City-260 $a', 'Country Code-008', 'Language-008', 'Series-490', 'ISSN-022 $a', 'cnCode', '教育部学科门类', '教育部一级学科', '教育部二级学科', '未能识别的clc', '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确']
    source = os.path.join(PATH, 'jove-中图分类号数据_评分模型.csv')
    scored = BaseExtractScore(table=source, base_columns=base_columns).process()
    target = os.path.join(PATH, 'jove-中图分类号数据_评分模型-高分学科.csv')
    scored.to_csv(target, index=False)
 def task3():
    """Run BaseExtractScore over the hard-coded 0028-4793 CSV and save the result as '0028-4793-高分学科.csv'."""
    PATH = 'Z:/数据处理流程/'
    base_columns = ['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'OI', 'FU', 'FP', 'FX', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA', 'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA', 'UT', 'cncode', '一级学科', '二级学科', '一级学科数目', '二级学科数目']
    source = os.path.join(PATH, '0028-4793_评分模型.csv')
    scored = BaseExtractScore(table=source, base_columns=base_columns).process()
    target = os.path.join(PATH, '0028-4793-高分学科.csv')
    scored.to_csv(target, index=False)
 # Script entry point: only task3 is run; task1 and task2 are commented out.
 if __name__ == '__main__':
    # task1()
    # task2()
    task3()
--- a/article_subject/main.py
+++ b/article_subject/main.py
@ -0,0 +1,95 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/6/21 8:56
 # @Author  : ZhaoXiangPeng
 # @File    : main.py
 import pandas as pd
 from article_subject.utils import get_row_top, merge_table, get_today
 def func(row: pd.Series, num: int = 3):
    """Pick the subjects on which the three subject sources agree.

    The ';'-separated subject strings in ``row['教育部一级学科']``,
    ``row['HighScoreSubject']`` and ``row['HighScoreSubjectJournal']`` are
    pooled and counted. Subjects counted more than twice are returned, most
    frequent first, capped at ``num`` entries.

    :param row: Series (or mapping) holding the three columns above as strings.
    :param num: maximum number of subjects to return; it used to be ignored
        in favour of a hard-coded 3, which is still the default.
    :return: the selected subjects joined with ';' ('' when none qualify).
    """
    from collections import Counter
    x, y, z = row['教育部一级学科'], row['HighScoreSubject'], row['HighScoreSubjectJournal']
    sub_count = Counter(';'.join([x, y, z]).split(';'))
    # most_common() orders by descending count and keeps first-seen order for
    # ties, the same order the previous stable sort produced.
    # NOTE(review): the original comment said "appears 2 or more times" while
    # the check is strictly greater than 2; the threshold is left unchanged.
    sub_list = [sub for sub, count in sub_count.most_common() if count > 2]
    return ';'.join(sub_list[:max(num, 0)])
 class Article2Subject:
    """Derive a 'LikeSubject' column for articles from article-level and journal-level subject data.

    Typical use is ``execute()``, which loads the journal subject sources,
    merges them and then scores the article table.
    """
    def __init__(self):
        # All pipeline attributes are created up front so that they always exist.
        self.periodical = None
        self.tc_huang = None
        self.journal_subject = None
    def periodical_top(self, filename):
        """Load a journal score CSV and keep the top subjects per ISSN.

        :param filename: CSV with an 'issn' column; all remaining columns are
            passed to ``get_row_top`` row by row.
        :return: DataFrame with columns 'issn' and 'HighScoreSubjectJournal'.
        """
        periodical = pd.read_csv(filename)
        periodical.set_index(['issn'], inplace=True)
        periodical['HighScoreSubjectJournal'] = periodical.apply(get_row_top, axis=1)
        periodical = periodical[['HighScoreSubjectJournal']].reset_index()
        self.periodical = periodical
        return periodical
    def tc_huang_subject(self, filename):
        """Load a CSV of (issn, 一级学科) rows and join the subjects of each ISSN with ';'.

        :param filename: CSV with 'issn' and '一级学科' columns; rows without a
            subject are ignored.
        :return: DataFrame with columns 'issn' and 'HighScoreSubjectJournal'.
        """
        tc_huang = pd.read_csv(filename)
        tc_huang = tc_huang[tc_huang['一级学科'].notnull()]
        # Group by the plain column name: grouping by a one-element list makes
        # pandas >= 2 yield 1-tuples as keys, which would end up in 'issn'.
        tc_huang_list = [
            {'issn': issn, 'HighScoreSubjectJournal': ';'.join(g['一级学科'])}
            for issn, g in tc_huang.groupby(by='issn')
        ]
        tc_huang = pd.DataFrame(tc_huang_list, columns=['issn', 'HighScoreSubjectJournal'])
        self.tc_huang = tc_huang
        return tc_huang
    def join_journal_subject(self):
        """Stack both journal sources; on a duplicated ISSN the ``tc_huang`` row wins."""
        journal_subject = pd.concat([self.periodical, self.tc_huang])
        # tc_huang rows come last, so keep='last' prefers them.
        journal_subject.drop_duplicates(subset=['issn'], keep='last', inplace=True)
        self.journal_subject = journal_subject
    def score_model(self, filename):
        """Score the article table and write './文章到学科<today>.csv'.

        :param filename: CSV holding the article columns plus one score column
            per subject (listed in ``score_columns`` below).
        """
        score = pd.read_csv(filename)
        score_columns = ['军事学_军事后勤学与军事装备学', '军事学_军事思想及军事历史', '军事学_军事管理学', '军事学_军事训练学', '军事学_军队指挥学', '军事学_军队政治工作学', '军事学_战役学', '军事学_战术学', '军事学_战略学', '农学_作物学', '农学_兽医学', '农学_农业资源与环境', '农学_园艺学', '农学_林学', '农学_植物保护', '农学_水产', '农学_畜牧学', '农学_草学', '医学_中医学', '医学_中药学', '医学_中西医结合', '医学_临床医学', '医学_公共卫生与预防医学', '医学_医学技术', '医学_口腔医学', '医学_基础医学', '医学_护理学', '医学_特种医学', '医学_药学', '历史学_世界史', '历史学_中国史', '历史学_考古学', '哲学_哲学', '工学_交通运输工程', '工学_仪器科学与技术', '工学_信息与通信工程', '工学_光学工程', '工学_公安技术', '工学_兵器科学与技术', '工学_农业工程', '工学_冶金工程', '工学_力学', '工学_动力工程及工程热物理', '工学_化学工程与技术', '工学_土木工程', '工学_地质资源与地质工程', '工学_城乡规划学', '工学_安全科学与工程', '工学_建筑学', '工学_控制科学与工程', '工学_机械工程', '工学_材料科学与工程', '工学_林业工程', '工学_核科学与技术', '工学_水利工程', '工学_测绘科学与技术', '工学_环境科学与工程', '工学_生物医学工程', '工学_生物工程', '工学_电子科学与技术', '工学_电气工程', '工学_石油与天然气工程', '工学_矿业工程', '工学_纺织科学与工程', '工学_网络空间安全', '工学_航空宇航科学与技术', '工学_船舶与海洋工程', '工学_计算机科学与技术', '工学_软件工程', '工学_轻工技术与工程', '工学_风景园林学', '工学_食品科学与工程', '教育学_体育学', '教育学_心理学', '教育学_教育学', '文学_中国语言文学', '文学_外国语言文学', '文学_新闻传播学', '法学_公安学', '法学_政治学', '法学_民族学', '法学_法学', '法学_社会学', '法学_马克思主义理论', '理学_化学', '理学_地球物理学', '理学_地理学', '理学_地质学', '理学_大气科学', '理学_天文学', '理学_数学', '理学_海洋科学', '理学_物理学', '理学_生态学', '理学_生物学', '理学_科学技术史', '理学_系统科学', '理学_统计学', '管理学_公共管理', '管理学_农林经济管理', '管理学_图书情报与档案管理', '管理学_工商管理', '管理学_管理科学与工程', '经济学_应用经济学', '经济学_理论经济学', '艺术学_戏剧与影视学', '艺术学_美术学', '艺术学_艺术学理论', '艺术学_设计学', '艺术学_音乐与舞蹈学']
        # Highest-scoring subjects among the score columns.
        score['HighScoreSubject'] = score[score_columns].apply(get_row_top, axis=1)
        # The raw score columns are no longer needed; the list of columns to
        # delete used to be a second, identical copy of score_columns.
        score = score.drop(columns=score_columns)
        # Attach the journal-level subjects, then vote across the three sources.
        hebing_df = self.base_add_journal(score)
        hebing_df['LikeSubject'] = hebing_df.apply(func, axis=1)
        hebing_df.to_csv(f'./文章到学科{get_today()}.csv', index=False)
    def base_add_journal(self, base_table):
        """Left-join the journal subjects onto the articles by ISSN.

        Rows lacking '教育部一级学科' or a matched 'HighScoreSubjectJournal' are dropped.
        """
        base_table = base_table[base_table['教育部一级学科'].notnull()]
        new_df = pd.merge(left=base_table, right=self.journal_subject, how='left', left_on=['ISSN'], right_on=['issn'])
        return new_df[new_df['HighScoreSubjectJournal'].notnull()]
    def execute(self):
        """Run the whole pipeline against the hard-coded input files."""
        self.periodical_top('../SubjectData/getScore：step1.csv')
        self.tc_huang_subject('D:/Work/教育部学科数据2021年6月2日_processed.csv')
        self.join_journal_subject()
        self.score_model(f'./merge_table_{get_today()}.csv')
 # Script entry point: runs the full Article2Subject pipeline.
 if __name__ == '__main__':
    # merge_table('Z:/文章摘要推荐池/学科评分/toppaper核心数据/', './')
    a2s = Article2Subject()
    a2s.execute()
--- a/article_subject/utils.py
+++ b/article_subject/utils.py
@ -0,0 +1,56 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/6/21 8:53
 # @Author  : ZhaoXiangPeng
 # @File    : utils.py
 import pandas as pd
 import datetime
 import os
 def get_today(fmt='%Y%m%d'):
    """Return today's local date rendered with the ``strftime`` pattern *fmt*."""
    today = datetime.date.today()
    return today.strftime(fmt)
 def get_row_top(row: pd.Series, num: int = 3):
    """Return the labels of the *num* largest values of *row*, joined with ';'.

    Only the text after the last '_' of each index label is kept.
    """
    ranked = row.sort_values(ascending=False)
    return ';'.join(label.split('_')[-1] for label in ranked[:num].index)
 def get_row_top_join_sub(row: pd.Series, num: int = 3, split: float = 0.9, split_on: bool = True):
    """Format the best-scoring entries of *row* as 'label,score' pairs joined with '; '.

    :param row: Series of scores indexed by label.
    :param num: upper bound on the number of entries returned.
    :param split: score threshold used when ``split_on`` is true.
    :param split_on: when true, at most as many entries are returned as there
        are scores >= ``split``, and exactly one when no score reaches it.
    """
    if split_on:
        hits = len(row[row.values >= split])
        num = 1 if hits == 0 else min(num, hits)
    ranked = row.sort_values(ascending=False)[:num].to_dict()
    return '; '.join(f'{sub},{score}' for sub, score in ranked.items())
 def merge_table(filepath, output_path: str = None):
    """Concatenate every CSV file found directly inside *filepath*.

    :param filepath: input directory (a trailing '/' is optional).
    :param output_path: when not empty, the merged table is also written to
        ``merge_table_<YYYYMMDD>.csv`` inside this directory.
    :return: the merged DataFrame (empty when the directory holds no files).
    """
    # Read everything first and concatenate once: growing a frame inside the
    # loop re-copies all previously read rows on every iteration.
    frames = [pd.read_csv(os.path.join(filepath, filename)) for filename in os.listdir(filepath)]
    return_df = pd.concat(frames) if frames else pd.DataFrame()
    if output_path:
        day = get_today()
        return_df.to_csv(os.path.join(output_path, f'merge_table_{day}.csv'), index=False)
    return return_df
--- a/bcr/BCR_20240201.py
+++ b/bcr/BCR_20240201.py
@ -0,0 +1,204 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/2/2 10:45
 # @Author  : zhaoxiangpeng
 # @File    : BCR_20240201.py
 import os
 from copy import deepcopy
 import pandas as pd
 import data_process_tool
 from bcr.utils import read_file, str2float, str2int
 from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
 # ROOT_PATH = "Y:\\zhaoxiangpeng\\2024BCR"
 # ROOT_PATH = "Y:\\zhaoxiangpeng\\BCR202403"
 # ROOT_PATH = "Y:\\BCR\\202407"
 # Columns that step4 keeps from the unmatched rows before each further SCOPUS merge.
 C_COLUMNS = ['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total', 'ISBN']
 def main():
    """Load and de-duplicate the merged records, then run steps 2 to 4."""
    merged = read_file(os.path.join(ROOT_PATH, 'MergeFile'))
    supplement = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t')
    table = pd.concat([merged, supplement])
    # The supplement is stacked last, so its rows win on a duplicated EID.
    table.drop_duplicates(subset=['EID'], keep='last', inplace=True)
    # Normalise the usage counters before aggregating.
    for counter in ('2021', '2022', '2023', 'Grand Total'):
        table[counter] = table[counter].apply(str2float)
    step2_table = step2(table, export=True)
    _, no_data_table = step3(step2_table, export=True)
    step4(no_data_table)
 def process1(table: pd.DataFrame):
    """Split *table* into a DOI-keyed part and an ISBN-keyed part.

    :param table: frame with 'DOI' and 'ISBN' columns; it is not modified.
    :return: ``(doi_table, isbn_table)`` -- the rows that have a DOI, one per
        distinct DOI, and the rows without a DOI, one per distinct ISBN.
    """
    has_doi = table['DOI'].notnull()
    # Boolean indexing and drop_duplicates both return new frames, so neither
    # a defensive deep copy nor an in-place edit of a filtered slice is needed.
    doi_table = table[has_doi].drop_duplicates(subset=['DOI'])
    isbn_table = table[~has_doi].drop_duplicates(subset=['ISBN'])
    return doi_table, isbn_table
 def process_func2(table: pd.DataFrame):
    """
    Split the '; '-separated 'ISBN' column into one row per ISBN.

    The original value is kept in 'ISBN RAW'; the input frame is left untouched.
    """
    frame = deepcopy(table)
    # Cast to str first, otherwise the split does not work on non-string cells.
    frame['ISBN'] = frame['ISBN'].astype(str)
    pieces = (
        frame['ISBN'].str.split('; ', expand=True)
        .stack()  # one entry per (row, piece)
        .reset_index(level=1, drop=True)  # keep only the original row label
        .rename('ISBN')
    )
    EID_PROED: pd.DataFrame = frame.rename(columns={'ISBN': 'ISBN RAW'}).join(pieces)
    return EID_PROED
 def process_func3(export: bool = True):
    """
    Return the combined table of the two SCOPUS book lists.

    When the combined workbook already exists under ROOT_PATH it is simply read
    back; otherwise both source workbooks are loaded, reduced to the kept
    columns, stacked and de-duplicated on the three ISBN columns.

    :param export: when the table has to be built, also save it as the combined workbook.
    """
    keep_columns = [
        'Title', 'Scopus ID',
        'Print ISBN', 'E-ISBN', 'Other ISBN',
        'Publication year', 'Publisher imprint', 'Publisher imprints grouped to main Publisher',
        'Classification 1', 'Classification 2', 'Classification 3', 'Classification 4',
    ]
    export_file_path = os.path.join(ROOT_PATH, "After\\4.两表字段合并.xlsx")
    # Reuse the workbook produced by an earlier run.
    if os.path.exists(export_file_path):
        return pd.read_excel(export_file_path, sheet_name=0)
    source_dir = 'Y:\\BCR\\202407'
    source_names = ('Scopusbooks04072023.xlsx', 'Scopus Books June 2023新增书目3.9种及检索式.xlsx')
    parts = [
        pd.read_excel(os.path.join(source_dir, name), sheet_name=0)[keep_columns]
        for name in source_names
    ]
    table0 = pd.concat(parts)
    table0.drop_duplicates(subset=['Print ISBN', 'E-ISBN', 'Other ISBN'], keep='last', inplace=True)
    table0['Scopus ID'] = table0['Scopus ID'].astype(str)
    if export:
        table0.to_excel(export_file_path, index=False)
    return table0
 def step2(table: pd.DataFrame, export: bool = True):
    """
    Second requirement of the PPT: total the usage counters per ISBN.

    :param table: frame with 'DOI', 'ISBN', '2021', '2022', '2023' and
        'Grand Total' columns; it is not modified.
    :param export: also write the result to the step-2 workbook under ROOT_PATH.
    :return: one row per ISBN with the summed counters and the DOI of the
        first row seen for that ISBN.
    """
    count_columns = ['2021', '2022', '2023', 'Grand Total']
    # Select the counters with a list: indexing a GroupBy with a bare tuple of
    # names is rejected by pandas >= 2.0.
    group_by = table.groupby(by=['ISBN'])[count_columns].sum().reset_index()
    # drop_duplicates returns a new frame, so the column slice is never edited in place.
    keep_columns = table[['DOI', 'ISBN']].drop_duplicates(subset=['ISBN'], keep='first')
    result_table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)
    return result_table
 def step3(table: pd.DataFrame, export: bool = True):
    """
    Match the per-ISBN records against last year's BCR master table.

    Rows with a DOI are matched on DOI, rows without one on ISBN; the columns
    of both tables are combined in the result.

    :param table: per-ISBN table with 'DOI' and 'ISBN' columns.
    :param export: also write the matched and the unmatched rows to workbooks under ROOT_PATH.
    :return: ``(result_table, all_no_data)`` -- every row with the master-table
        columns attached, and the rows without a master-table match reduced to
        the columns of *table*.
    """
    BASE_FILE = pd.read_excel("Y:\\BCR\\2024BCR\\BCR2023数据处理\\副本BCR2022总表-20220729.xlsx",
                              sheet_name=0)  # last year's BCR master table
    # Tag the master-table headers so they cannot clash with the columns of `table`.
    new_columns = data_process_tool.rename_head(BASE_FILE, postfix='-Other')
    BASE_FILE.rename(columns=new_columns, inplace=True)
    doi_table, isbn_table = process1(table)
    doi_ = pd.merge(doi_table, BASE_FILE, how='left', left_on=['DOI'], right_on=['DOI-Other'])
    """
    # 把doi分成有数据的和没数据的
    has_data = doi_[doi_['DOI-Other'].notnull()]  # 匹配到数据
    no_data = doi_[doi_['DOI-Other'].isnull()]  # 使用doi没有匹配到的
    # del doi_
    no_data = no_data[table.columns.values.tolist()]  # 把没有数据的多余列去除
    """
    # Rows without a DOI are matched on ISBN; there is no further key to fall back on.
    isbn_ = pd.merge(isbn_table, BASE_FILE, how='left', left_on=['ISBN'], right_on=['ISBN-Other'])
    # Combine the DOI matches with the ISBN matches.
    result_table = pd.concat([doi_, isbn_])
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR匹配结果.xlsx'), index=False)
    # Rows whose master-table ISBN is empty are treated as unmatched.
    all_no_data = result_table[result_table['ISBN-Other'].isnull()]
    all_no_data = all_no_data[table.columns.values.tolist()]  # keep the base columns only
    if export:
        all_no_data.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), index=False)
    return result_table, all_no_data
 def step4(table: pd.DataFrame, export: bool = True):
    """
    Handle the records that did not match last year's BCR master table.

    The records are matched against the SCOPUS source book list: the ISBNs are
    split into one row each and tried against the 'Print ISBN', 'E-ISBN' and
    'Other ISBN' columns in turn, and the columns of both tables are combined.

    NOTE(review): the original notes also mention a later match against OASIS
    records (authors, subject classification, removal of sensitive titles);
    nothing in this function does that.

    :param table: unmatched records with 'ISBN' and the columns listed in C_COLUMNS.
    :param export: write the matched and unmatched results to workbooks under ROOT_PATH.
    """
    df1 = process_func2(table)  # unmatched records, one row per individual ISBN
    df1.drop_duplicates(subset=['ISBN RAW', 'ISBN'], inplace=True)
    df1['ISBN'] = df1['ISBN'].astype(str)
    print(df1)
    df2 = process_func3(export=export)
    # Compare ISBNs as strings on both sides.
    for col in ['Print ISBN', 'E-ISBN', 'Other ISBN']:
        df2[col] = df2[col].astype(str)
    # Pass 1: match on the print ISBN.
    c1 = pd.merge(df1, df2, left_on=['ISBN'], right_on=['Print ISBN'], how='left')
    c1_in = c1[c1['Print ISBN'].notnull()]
    c1_not = c1[c1['Print ISBN'].isnull()]
    c1_not = c1_not[C_COLUMNS]  # drop the empty SCOPUS columns before the next merge
    # Pass 2: match the leftovers on the e-ISBN.
    c2 = pd.merge(c1_not, df2, left_on=['ISBN'], right_on=['E-ISBN'], how='left')
    c2_in = c2[c2['E-ISBN'].notnull()]
    c2_not = c2[c2['E-ISBN'].isnull()]
    c2_not = c2_not[C_COLUMNS]
    # Pass 3: match the leftovers on the other ISBN.
    c3 = pd.merge(c2_not, df2, left_on=['ISBN'], right_on=['Other ISBN'], how='left')
    c3_in = c3[c3['Other ISBN'].notnull()]
    c3_not = c3[c3['Other ISBN'].isnull()]
    # Combine the hits of the three passes, one row per original ISBN string.
    r1_in = pd.concat([c1_in, c2_in, c3_in])
    r1_in.drop_duplicates(subset=['ISBN RAW'], inplace=True)
    r1_not = c3_not
    # Anti-join: the matched rows are appended twice, so keep=False removes every
    # 'ISBN RAW' that was matched through any of its ISBNs.
    r1_not = pd.concat([r1_not, r1_in, r1_in]).drop_duplicates(subset=['ISBN RAW'], keep=False)
    r1_not = r1_not[['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total']]
    r1_not.rename(columns={'ISBN RAW': 'ISBN'}, inplace=True)
    if export:
        r1_in.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目匹配.xlsx'), index=False)
        r1_not.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目未匹配到.xlsx'), index=False)
 # Script entry point: runs the full pipeline. The string and the comments
 # below hold alternative entry points that resume from saved step results.
 if __name__ == '__main__':
    main()
    """
    step2_table = pd.read_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), sheet_name=0)
    step3_table, no_data_table = step3(step2_table, export=True)
    step4(no_data_table)
    """
    # ste3_table = pd.read_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), sheet_name=0)
    # step4(ste3_table)
--- a/bcr/BCR_20240311.py
+++ b/bcr/BCR_20240311.py
@ -0,0 +1,53 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/3/11 13:48
 # @Author  : zhaoxiangpeng
 # @File    : BCR_20240311.py
 import os
 import pandas as pd
 from loguru import logger
 # When true, step0 adds a 'SOURCE' column holding the path of the file each row came from.
 ADD_SOURCE = True
 # Directory that contains the raw input folder read by step0 and the 'After' output folder.
 BASE_PATH = 'Y:\\BCR\\202407'
 def load_all_small_file(path: str):
    """Yield the full path of every entry found one directory level below *path*."""
    for sub_name in os.listdir(path):
        sub_dir = os.path.join(path, sub_name)
        yield from (os.path.join(sub_dir, entry) for entry in os.listdir(sub_dir))
 def step0():
    """Merge all raw CSV files below BASE_PATH and save them in chunks of one million rows.

    Every chunk is written twice into the 'After' folder, as '<n>.txt'
    (tab-separated) and as '<n>.xlsx', with n starting at 1.
    """
    frames = []
    for file_path in load_all_small_file(os.path.join(BASE_PATH, "API分工原始采集记录")):
        logger.debug('当前处理 %s' % file_path)
        table = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, index_col=False)
        if ADD_SOURCE:
            # Record which file each row came from (the full path, as before).
            table['SOURCE'] = file_path
        logger.debug('表头: %s' % table.columns.values.tolist())
        frames.append(table)
    # Concatenate once; growing the frame inside the loop re-copies all
    # previously read rows on every iteration.
    big_table = pd.concat(frames) if frames else pd.DataFrame()
    split = 1000000
    save_path = os.path.join(BASE_PATH, "After")
    for file_idx, start in enumerate(range(0, len(big_table), split), start=1):
        chunk = big_table[start: start + split]
        chunk.to_csv(os.path.join(save_path, '%s.txt' % file_idx), sep='\t', index=False)
        chunk.to_excel(os.path.join(save_path, '%s.xlsx' % file_idx), index=False)
 # Script entry point: merge and re-chunk the raw files.
 if __name__ == '__main__':
    step0()
--- a/bcr/BCR_20240426.py
+++ b/bcr/BCR_20240426.py
@ -0,0 +1,45 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/4/26 14:15
 # @Author  : zhaoxiangpeng
 # @File    : BCR_20240426.py
 import os
 import pandas as pd
 from loguru import logger
 # Directory holding the 2024 supplementary export read by func1 and the file it writes.
 ROOT_PATH = 'Y:\\zhaoxiangpeng\\BCR202404'
 def func1():
    """
    Fill the '2021' column of the 2024 supplementary export from the 2022 BCR data.

    The 2022 table is reduced to one ('EID', '2021') row per EID and left-joined
    onto the supplementary export; the result is printed and saved as a
    tab-separated file under ROOT_PATH.
    """
    bcr22_path = 'Y:\\zhaoxiangpeng\\BCR2022'
    # 2021 counts from the 2022 table, one row per EID.
    lookup = pd.read_csv(os.path.join(bcr22_path, 'eid去重保留最大grandTotal.csv'))[['EID', '2021']]
    lookup = lookup.drop_duplicates(subset=['EID'])
    # Supplementary 2024 data.
    bcr24_extend = pd.read_csv(os.path.join(ROOT_PATH, 'Grand Total为空-20240410 11时06分下载.csv'), index_col=False)
    output_columns = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码',
                      '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者',
                      '通讯地址',
                      '编者', '出版商', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', '原始文献语言',
                      '来源出版物名称缩写', '文献类型', '出版阶段', '访问类型', '来源出版物', 'EID', 'Sort Year',
                      '2021', '2022', '2023', '2024', 'Grand Total']
    new_table = pd.merge(bcr24_extend, lookup, how='left', on=['EID'])[output_columns]
    print(new_table)
    new_table.to_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t', index=False)
 # Script entry point.
 if __name__ == '__main__':
    func1()
--- a/bcr/BCR_20240724.py
+++ b/bcr/BCR_20240724.py
@ -0,0 +1,103 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/7/24 20:03
 # @Author  : zhaoxiangpeng
 # @File    : BCR_20240724.py
 import os
 import pandas as pd
 from bcr.utils import read_file, str2float, str2int
 from bcr.BCR_20240201 import step2, step3, step4
 from bcr.BCR_20240201 import main, ROOT_PATH
 from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
 # Default set of output columns used by step2_change when keep_columns is not given.
 c2 = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '文献类型', 'DOI', 'ISBN', 'EID',
       'Sort Year', '2021', '2022', '2023', '2024', 'Grand Total']
 def step2_change(table: pd.DataFrame, reduce_columns: list = None, keep_columns: list = None, export: bool = True):
    """
    Revised second requirement: total the counters per ISBN and keep one descriptive row per ISBN.

    :param table: source records; the counter columns are normalised in place with str2float.
    :param reduce_columns: counter columns to sum per ISBN (default: '2021', '2022', '2023', 'Grand Total').
    :param keep_columns: columns of the returned table (default: ``c2``).
    :param export: also write the result to the step-2 workbook under ROOT_PATH.
    :return: one row per ISBN, taken from the first 'Book' row of that ISBN when
        there is one and from its first row otherwise, with the summed counters.
    """
    if reduce_columns is None:
        reduce_columns = ['2021', '2022', '2023', 'Grand Total']
    if keep_columns is None:
        keep_columns = c2
    # Normalise the numeric columns before summing.
    for col in reduce_columns:
        table[col] = table[col].apply(str2float)
    # Per-ISBN totals.
    agg_result = table.groupby(by=['ISBN'])[reduce_columns].sum().reset_index()
    # Stack the Book rows above all other rows so that keep='first' prefers a
    # Book row whenever an ISBN has one.
    book_rows = table[table["文献类型"] == "Book"]
    other_rows = table[table["文献类型"] != "Book"]
    merge_table = pd.concat([book_rows, other_rows]).drop_duplicates(subset=['ISBN'], keep='first')
    # The per-row counters are replaced by the totals.
    merge_table = merge_table.drop(reduce_columns, axis=1)
    result = pd.merge(merge_table, agg_result, how='left', left_on=['ISBN'], right_on=['ISBN'])
    result_table = result[keep_columns]
    # NOTE(review): these casts are applied to `result` after `result_table`
    # has been taken from it, so they do not show up in the returned table.
    result['年份'] = result['年份'].astype(str)
    result['Sort Year'] = result['Sort Year'].astype(str)
    if export:
        result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False)
    return result_table
 def main_change():
    """Load and de-duplicate the merged records, then run the revised step 2 only."""
    merged = read_file(os.path.join(ROOT_PATH, 'MergeFile'))
    supplement = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t')
    table = pd.concat([merged, supplement])
    # The supplement is stacked last, so its rows win on a duplicated EID.
    table.drop_duplicates(subset=['EID'], keep='last', inplace=True)
    # Normalise the usage counters.
    for counter in ('2021', '2022', '2023', 'Grand Total'):
        table[counter] = table[counter].apply(str2float)
    step2_table = step2_change(table, export=True)
 def change_field_type():
    """Re-save the step-2 workbook with '年份' and 'Sort Year' passed through str2int."""
    table = pd.read_excel('Y:\\BCR\\202407\\RESULT\\2.统计ISBN使用量(保留DOI).xlsx', sheet_name=0, engine='openpyxl')
    for year_col in ('年份', 'Sort Year'):
        table[year_col] = table[year_col].apply(str2int)
    table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI)2.xlsx"), index=False)
 # Script entry point: runs the revised step 2; change_field_type is commented out.
 if __name__ == '__main__':
    main_change()
    # change_field_type()
--- a/bcr/BCR_20241224.py
+++ b/bcr/BCR_20241224.py
@ -0,0 +1,111 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/12/24 15:03
 # @Author  : zhaoxiangpeng
 # @File    : BCR_20241224.py
 import os
 import re
 import warnings
 import chardet
 import pandas as pd
 from loguru import logger
 from bcr.utils import read_file, str2float, export_small_file
 import bcr.BCR_20240724 as bcr_20240724
 import bcr.BCR_20240201 as bcr_20240201
 from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH
 def task_change1(base_table: pd.DataFrame = None) -> pd.DataFrame:
    """
    Append the re-collected records that are not yet part of *base_table*.

    :param base_table: main table with an 'EID' column; when it is not a
        DataFrame the re-collected workbook is returned as is.
    :return: *base_table* followed by the re-collected rows whose EID it lacks.
    """
    extend_table = pd.read_excel(os.path.join(ROOT_PATH, 'BCR2024书目补采API.xlsx'), engine='openpyxl', sheet_name=0)
    if not isinstance(base_table, pd.DataFrame):
        return extend_table
    # Distinct EIDs of the main table, renamed so the join cannot clash with 'EID'.
    known = base_table[['EID']].drop_duplicates(subset=['EID']).rename(columns={'EID': 'dup_eid'})
    # After the left join, an empty 'dup_eid' marks a row the main table lacks.
    joined = extend_table.merge(right=known, how='left', left_on=['EID'], right_on=['dup_eid'])
    # Keep those rows and remove the helper column again.
    missing = joined[joined['dup_eid'].isnull()].drop(columns=['dup_eid'])
    return pd.concat([base_table, missing])
 def step1_merge():
    """Stack every workbook in the 2025 'MergeFile' folder plus one re-collected CSV.

    :return: a single DataFrame, workbooks first (in ``os.listdir`` order), the CSV last.
    """
    path = 'Y:\\BCR\\2025BCR'
    path2 = os.path.join(path, 'MergeFile')
    # Collect the pieces and concatenate once; concatenating inside the loop
    # re-copies all previously read rows on every iteration.
    frames = []
    for file in os.listdir(path2):
        file_full_path = os.path.join(path2, file)
        small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0)
        print(small_table.shape)
        frames.append(small_table)
    frames.append(pd.read_csv(r'Y:\BCR\BCR202412\补采1-20241127 13时37分下载(1).csv'))
    return pd.concat(frames)
 def step1_merge_change():
    """
    Stack the merged result workbooks and add the re-collected records.

    :return: the result of ``task_change1`` applied to the stacked workbooks.
    """
    # Raw string: '\文' is not a valid escape sequence, and recent Python
    # versions warn about it in a normal string literal. The value is unchanged.
    path2 = os.path.join(ROOT_PATH, r'RESULT\文件和并结果')
    # Read all workbooks, then concatenate once instead of once per file.
    frames = [
        pd.read_excel(os.path.join(path2, file), engine='openpyxl', sheet_name=0)
        for file in os.listdir(path2)
    ]
    big_table = pd.concat(frames) if frames else pd.DataFrame()
    return task_change1(big_table)
 def step2_change(table: pd.DataFrame, export: bool = True):
    """Normalise the counters, total them per ISBN and split the rows by document type.

    NOTE(review): this function looks unfinished -- ``export`` is unused, the
    last line is a bare expression and nothing is returned. main() in this
    file calls ``bcr_20240724.step2_change`` instead.
    """
    # Regular aggregation
    # 1. Totals
    # The numeric types have to be unified before summing.
    table['2021'] = table['2021'].apply(str2float)
    table['2022'] = table['2022'].apply(str2float)
    table['2023'] = table['2023'].apply(str2float)
    table['Grand Total'] = table['Grand Total'].apply(str2float)
    # Merge the records sharing an ISBN into one, summing the yearly and Grand Total counts.
    agg_result = table.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum()
    agg_result.reset_index(inplace=True)  # turn the ISBN index back into a column
    # 2. Split into Book rows and all other rows
    filter_table_is = table[table["文献类型"] == "Book"]
    filter_table_not = table[table["文献类型"] != "Book"]
    # The selection below is computed but not stored or returned.
    filter_table_is[KEEP_COLUMNS]
 def main():
    """Run steps 1 to 4, reusing the saved results of steps 2 and 3 when their workbooks exist."""
    STEP_IS_EXIST = True
    if STEP_IS_EXIST:
        table = step1_merge_change()
    # Step 2: read the saved workbook if there is one, otherwise compute it.
    step_2_table_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx")
    if os.path.exists(step_2_table_path):
        step2_table = pd.read_excel(step_2_table_path, sheet_name=0)
    else:
        step2_table = bcr_20240724.step2_change(table, reduce_columns=REDUCE_COLUMNS, keep_columns=KEEP_COLUMNS,
                                                export=True)
    # Step 3: same pattern for the table of unmatched records.
    no_data_table_path = os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx')
    if os.path.exists(no_data_table_path):
        no_data_table = pd.read_excel(no_data_table_path, sheet_name=0)
    else:
        _, no_data_table = bcr_20240201.step3(step2_table, export=True)
    # Step 4
    bcr_20240201.step4(no_data_table)
 # Script entry point.
 if __name__ == '__main__':
    main()
--- a/bcr/init.py
+++ b/bcr/init.py
@ -0,0 +1,4 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/12/1 14:20
 # @Author  : ZAOXG
 # @File    : __init__.py
--- a/bcr/api记录匹配.py
+++ b/bcr/api记录匹配.py
@ -0,0 +1,71 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/12/1 16:48
 # @Author  : ZAOXG
 # @File    : api记录匹配.py
 import data_process_tool
 import pandas as pd
 def step1():
    """Merge the small files of the two source folders, de-duplicating on ``EID``."""
    # API-collected files are GB2312 encoded; undecodable bytes and bad lines are skipped.
    data_process_tool.merge_table(
        r'F:\工作数据存储2022\20221201_bcrAPI对比\API采集-BCR2022相同记录\API采集',
        on_columns=['EID'],
        encoding='GB2312',
        encoding_errors='ignore',
        on_bad_lines='skip',
    )
    data_process_tool.merge_table(
        r'F:\工作数据存储2022\20221201_bcrAPI对比\2',
        on_columns=['EID'],
        encoding_errors='ignore',
        on_bad_lines='skip',
    )
 def step2():
    """Export the API-collected records that have no counterpart in the 2023 records.

    Both tables are read from the module-level ``root_path`` and de-duplicated.
    The API table is left-joined to the 2023 table on EID and the unmatched
    rows are written to ``未匹配到记录2-EID.csv``.
    """
    record1 = data_process_tool.read_data(root_path+'2023记录.csv')
    record1.drop_duplicates(inplace=True)
    record1.rename(columns={'EID': '原始记录'}, inplace=True)
    record2 = data_process_tool.read_data(root_path+'API采集.csv')
    record2.drop_duplicates(inplace=True)
    record3 = pd.merge(record2, record1, how='left', left_on=['EID'], right_on=['原始记录'])
    print(record3)
    # In a left join the unmatched rows carry NaN in the RIGHT key column;
    # the left key 'EID' is populated for every row, so testing it found nothing.
    error_record = record3[record3['原始记录'].isna()]
    error_record.to_csv(root_path+'未匹配到记录2-EID.csv', index=False)
 def step3():
    """Report, in both directions, the EIDs present in one table but not the other.

    Every column is suffixed with its source so the merged frame stays readable.
    """
    record1 = data_process_tool.read_data(root_path + '2023原始记录.csv')
    record1.drop_duplicates(subset=['EID'], inplace=True)
    # Tag each column with the table it came from.
    record1.rename(columns={name: name + '(2023记录)' for name in record1.columns}, inplace=True)

    record2 = data_process_tool.read_data(root_path + 'API下载记录.csv')
    record2.drop_duplicates(subset=['EID'], inplace=True)
    record2.rename(columns={name: name + '(API记录)' for name in record2.columns}, inplace=True)

    # Left join from the API side: rows with no 2023 counterpart.
    record3 = pd.merge(record2, record1, how='left', left_on=['EID(API记录)'], right_on=['EID(2023记录)'])
    print(record3)
    only_in_api = record3[record3['EID(2023记录)'].isna()]
    only_in_api.to_excel(root_path + 'API下载记录有2023原始记录无.xlsx', index=False)

    # Left join from the 2023 side: rows with no API counterpart.
    record4 = pd.merge(record1, record2, how='left', left_on=['EID(2023记录)'], right_on=['EID(API记录)'])
    print(record4)
    only_in_raw = record4[record4['EID(API记录)'].isna()]
    only_in_raw.to_excel(root_path + '2023原始记录有API下载记录无.xlsx', index=False)
 if __name__ == '__main__':
    root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
    # step2()
    data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API采集', to_type='csv',
                                  encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip')
    # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录', encoding_errors='ignore',
    #                               on_bad_lines='skip')
    # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录\2022-11-21-下载记录', encoding_errors='ignore',
    #                               on_bad_lines='skip')
    # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API失败记录重新下载采集',
    #                               to_type='csv',
    #                               encoding='GB2312', encoding_errors='ignore',
    #                               on_bad_lines='skip')
    # step3()
--- a/bcr/bcr记录保留多列.py
+++ b/bcr/bcr记录保留多列.py
@ -0,0 +1,69 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2023/1/17 14:06
 # @Author  : zhaoxiangpeng
 # @File    : bcr记录保留多列.py
 import data_process_tool
 import pandas as pd
 def func1():
    """
    Sum the usage columns per ISBN and keep DOI, source title, publisher and
    source-title abbreviation (first occurrence per ISBN) next to the sums.

    Reads ``eid去重.csv`` from the module-level ``root_path`` and writes
    ``统计ISBN使用量(保留来源出版物等字段).xlsx`` there.
    """
    usage_columns = ['2020', '2021', '2022', 'GrandTotal']
    record1 = data_process_tool.read_data(root_path + 'eid去重.csv')
    # Work on a copy so the clean-up never writes into a slice of record1.
    all_api_record = record1[['ISBN'] + usage_columns].copy()
    # Drop the known broken row whose '2020' cell holds an EID instead of a number.
    all_api_record = all_api_record[all_api_record['2020'] != '2-s2.0-84971016798']
    # Empty cells count as 0; the column is then numeric.
    all_api_record['2020'] = all_api_record['2020'].fillna(0).astype(float)
    # Select the columns with a list: tuple selection on a groupby raises on pandas >= 2.0.
    group_by = all_api_record.groupby(by=['ISBN'])[usage_columns].sum()
    group_by.reset_index(inplace=True)
    # Descriptive columns, one row per ISBN.
    keep_columns = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN']]
    keep_columns = keep_columns.drop_duplicates(keep='first', subset=['ISBN'])
    table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    print(table)
    table.to_excel(root_path+'统计ISBN使用量(保留来源出版物等字段).xlsx', index=False)
 def func2():
    """
    Split multi-valued ISBN cells ('; ' separated) into one row per ISBN, then
    sum the usage columns per (source title, publisher, abbreviation, ISBN).

    Reads ``eid去重.csv`` from the module-level ``root_path`` and writes
    ``统计ISBN使用量(ISBN分割).xlsx`` there.
    """
    usage_columns = ['2020', '2021', '2022', 'GrandTotal']
    record1 = data_process_tool.read_data(root_path + 'eid去重.csv')
    # Keep only the columns that are needed.
    all_api_record = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN'] + usage_columns]
    ISBNs = all_api_record['ISBN'].str.split('; ', expand=True)
    ISBNs = ISBNs.stack()  # one entry per individual ISBN
    ISBNs = ISBNs.reset_index(level=1, drop=True)  # keep only the source-row index level
    ISBNs.name = 'ISBN'
    all_api_record = all_api_record.drop(['ISBN'], axis=1).join(ISBNs)
    # Drop the known broken row whose '2020' cell holds an EID instead of a number.
    all_api_record = all_api_record[all_api_record['2020'] != '2-s2.0-84971016798']
    # Assign instead of chained in-place fillna; empty cells count as 0.
    all_api_record['2020'] = all_api_record['2020'].fillna(0).astype(float)
    # Select the columns with a list: tuple selection on a groupby raises on pandas >= 2.0.
    group_by = all_api_record.groupby(
        by=['来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN'])[usage_columns].sum()
    group_by.reset_index(inplace=True)
    # First DOI seen for each ISBN.
    keep_columns = all_api_record[['DOI', 'ISBN']].drop_duplicates(keep='first', subset=['ISBN'])
    table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN'])
    print(table)
    table.to_excel(root_path + '统计ISBN使用量(ISBN分割).xlsx', index=False)
 if __name__ == '__main__':
    root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
    func1()
    func2()
--- a/bcr/bcr记录合并.py
+++ b/bcr/bcr记录合并.py
@ -0,0 +1,60 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/12/14 14:46
 # @Author  : zhaoxiangpeng
 # @File    : api.py
 import data_process_tool
 import pandas as pd
 import numpy as np
 import re
 def step1():
    """
    De-duplicate by EID, keeping the row with the larger GrandTotal, then sum
    the usage columns per ISBN.

    The re-downloaded failure records are first completed with the columns
    they lack (taken from the raw 2023 records), aligned to the API record
    columns and appended to the API records.  The result is written to
    ``eid去重grandTotal合并.xlsx`` under the module-level ``root_path``.
    """
    record1 = data_process_tool.read_data(root_path + 'API下载记录.csv')
    record2 = data_process_tool.read_data(root_path + 'API失败记录重新下载采集.csv')
    record3 = data_process_tool.read_data(root_path + '2023原始记录.csv')
    # Columns pulled from the raw records to complete the failure records (EID is the join key).
    raw_api_chaji = {'EID', '归属机构', 'Author full names', '文献标题', 'CODEN', '访问类型', '带归属机构的作者',
                     '来源出版物名称缩写', '通讯地址', 'PubMed ID', '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'}
    print(record2)
    # Fill the columns the failure records lack from the raw records.
    record3 = record3[list(raw_api_chaji)].rename(columns={'EID': 'EID_copy'})
    temp_ = pd.merge(record2, record3, how='left', left_on=['EID'], right_on=['EID_copy'])
    # Keep only the columns the API records have.
    record2 = temp_[record1.columns.values.tolist()]
    all_api_record = pd.concat([record1, record2])
    # Sort ascending by GrandTotal and keep the last row per EID, i.e. the largest.
    all_api_record.sort_values(by=['GrandTotal'], inplace=True)
    all_api_record.drop_duplicates(subset=['EID'], keep='last', inplace=True)
    # Drop the known broken row with a boolean mask.  After concat the index labels
    # repeat, so dropping by label would also remove unrelated rows.
    all_api_record = all_api_record[all_api_record['2020'] != '2-s2.0-84971016798']
    # Empty cells count as 0; the column is then numeric.
    all_api_record['2020'] = all_api_record['2020'].fillna(0).astype(float)
    # Select the columns with a list: tuple selection on a groupby raises on pandas >= 2.0.
    group_by = all_api_record.groupby(by=['ISBN'])[['2020', '2021', '2022', 'GrandTotal']].sum()
    group_by.to_excel(root_path + 'eid去重grandTotal合并.xlsx')
 def is_float(data):
    """Return ``float(data)``; if the conversion fails, print *data* and return ``0``."""
    try:
        number = float(data)
    except Exception:
        # Show the offending value so bad cells can be traced.
        print(data)
        number = 0
    return number
 if __name__ == '__main__':
    root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/'
    step1()
--- a/bcr/config.py
+++ b/bcr/config.py
@ -0,0 +1,8 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/12/26 8:57
 # @Author  : zhaoxiangpeng
 # @File    : config.py
 ROOT_PATH = r'Y:\BCR\BCR202412'  # root directory, used for both input and output
 KEEP_COLUMNS = ['作者', '作者 ID', '文献标题', '年份', '来源出版物名称', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '通讯地址', '编者', '出版商', 'ISSN', 'ISBN', 'PubMed ID', '原始文献语言', '来源出版物名称缩写', '文献类型', 'EID', 'Sort Year', '2022', '2023', '2024', 'Grand Total']  # columns to keep in the saved result
 # NOTE(review): presumably the usage columns that get summed per ISBN — confirm against step2_change.
 REDUCE_COLUMNS = ['2022', '2023', '2024', 'Grand Total']
--- a/bcr/record_20230524.py
+++ b/bcr/record_20230524.py
--- a/bcr/test.py
+++ b/bcr/test.py
@ -0,0 +1,35 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/7/25 15:33
 # @Author  : zhaoxiangpeng
 # @File    : test.py
 import pandas as pd
 # Scratch script: for each group of "T", sum A-D and attach the sums to one
 # representative row (the first row with D == 2, else the group's first row).
 table = pd.DataFrame([
    {"A": 1, "B": 2, "C": 3, "D": 2, "T": "X"},
    {"A": 2, "B": 3, "C": 4, "D": 1, "T": "Y"},
    {"A": 3, "B": 4, "C": 1, "D": 2, "T": "X"},
    {"A": 4, "B": 1, "C": 2, "D": 3, "T": "Z"},
 ])
 print(table)
 print('-0'*50)
 # Collects one representative row per group.
 big_table = pd.DataFrame()
 group_by = table.groupby(by=['T'])
 for gn, group in group_by:
    print('gn:', gn)
    print(group)
    # Column-wise totals of the numeric columns within the group.
    a = group[["A", "B", "C", "D"]].sum()
    print(a)
    # Prefer a row whose D equals 2 as the representative.
    f = group[group['D'] == 2]
    if f.empty:
        first = group.head(1)
    else:
        first = f.head(1)
    # NOTE(review): ``first`` is derived from ``group``; assigning through .loc here may
    # trigger pandas' SettingWithCopyWarning — confirm the write lands as intended.
    first.loc[:, ("A", "B", "C", "D")] = a
    print(first)
    big_table = pd.concat([big_table, first])
    print('-0-' * 50)
 print(big_table)
--- a/bcr/utils.py
+++ b/bcr/utils.py
@ -0,0 +1,50 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/2/4 10:55
 # @Author  : zhaoxiangpeng
 # @File    : utils.py
 import os
 from typing import List, Union
 import pandas as pd
 def read_file(path_or_files: Union[List[str], str], path: bool = True):
    """Read tab-separated files into one DataFrame.

    :param path_or_files: a directory, a single file path, or a list of file paths.
    :param path: when true and *path_or_files* is a string, the string is a
        directory and every entry in it is read; when false, a string is
        treated as one file.
    :return: the row-wise concatenation of all files (each file keeps its own
        row index); an empty DataFrame when there is nothing to read.
    """
    if isinstance(path_or_files, str):
        if path:
            path_or_files = [os.path.join(path_or_files, file) for file in os.listdir(path_or_files)]
        else:
            # A bare string would otherwise be iterated character by character.
            path_or_files = [path_or_files]
    tables = [pd.read_csv(file, sep='\t', low_memory=False) for file in path_or_files]
    if not tables:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the growing table for every file.
    return pd.concat(tables)
 def export_small_file(big_table, export_path: str = None, split: int = int(8e5)):
    """
    Export a large table as several smaller Excel files.

    Consecutive slices of at most *split* rows are written to
    ``<export_path>/1.xlsx``, ``2.xlsx``, ... without the index.
    """
    total_rows, _ = big_table.shape
    for file_idx, start in enumerate(range(0, total_rows, split), start=1):
        chunk = big_table[start: start + split]
        target = os.path.join(export_path, '%s.xlsx' % file_idx)
        chunk.to_excel(target, index=False)
 def str2float(string, replace=0):
    """Return ``float(string)``, or *replace* when that raises ``ValueError`` or ``TypeError``."""
    try:
        return float(string)
    except (ValueError, TypeError):
        return replace
 def str2int(string, replace=0):
    """Return ``int(string)``, or *replace* when the value cannot be converted.

    Besides malformed text (``ValueError``) and unsupported types
    (``TypeError``), infinite floats are covered: ``int(float('inf'))``
    raises ``OverflowError``, which previously escaped this helper.
    """
    try:
        return int(string)
    except (ValueError, TypeError, OverflowError):
        return replace
--- a/bcr/合并小文件.py
+++ b/bcr/合并小文件.py
@ -0,0 +1,224 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/2/1 17:10
 # @Author  : zhaoxiangpeng
 # @File    : 合并小文件.py
 import os
 import re
 import warnings
 import chardet
 import pandas as pd
 from loguru import logger
 def read_standard(filename):
    """Read *filename* with ``pd.read_table``, ignoring undecodable bytes and skipping malformed lines."""
    return pd.read_table(
        filename,
        encoding_errors='ignore',
        on_bad_lines='skip',
        low_memory=False,
    )
 def merge_files(path):
    """Read every file in *path* with ``read_standard`` and write them as one table.

    The merged table goes to ``Y:\\zhaoxiangpeng\\2024BCR\\2024BCR总表.csv``
    (tab-separated, no index).
    """
    tables = [read_standard(os.path.join(path, file)) for file in os.listdir(path)]
    # Concatenate once; growing the table file by file re-copies all earlier rows each time.
    big_table = pd.concat(tables) if tables else pd.DataFrame()
    big_table.to_csv(os.path.join("Y:\\zhaoxiangpeng\\2024BCR", '2024BCR总表.csv'), sep='\t', index=False)
 def read_file(file_path, encoding: str = 'gbk', error: bool = False):
    """Open *file_path* for text reading and return ``(file_object, encoding)``.

    :param encoding: encoding used when *error* is false.
    :param error: set after a decode failure; the encoding is then detected
        with ``chardet`` from the raw bytes and replaces *encoding*.
    :return: the open text file (the caller must close it) and the encoding
        it was opened with.
    """
    if error:
        warnings.warn('%s 编码异常，启用检查' % file_path)
        # Read the raw bytes in a ``with`` block so the probe handle is closed again.
        with open(file_path, 'rb') as check:
            info = chardet.detect(check.read())
        encoding = info['encoding']
        warnings.warn('%s 尝试使用 "%s" 解码' % (file_path, encoding))
    f = open(file_path, encoding=encoding)
    return f, encoding
 def merge_files_by_row(path):
    """
    Read the small files in *path* line by line and re-write them as standard
    shards of 100,000 rows each.

    Shards are written tab-separated to ``Y:\\zhaoxiangpeng\\2024BCR\\After``
    as ``1.csv``, ``2.csv``, ...; whatever is left after the last file is
    written as a final shard.
    """
    # Files that are skipped entirely (logged as possibly modified).
    ERROR_FILE = ['ALL-CM.CSV', 'ALL-HDX.CSV', '失败记录第二次重采-20231228 15时53分下载.csv']
    # Maximum number of leading columns kept from each line.
    data_column_count = 35
    files = os.listdir(path)
    # NOTE(review): never filled in; the final log line always prints an empty dict.
    decode_table = dict()
    split_str = '\t'
    # Row dicts waiting to be flushed to the next shard.
    documents = []
    document_count = 0
    file_seq = 1
    for file in files:
        if file in ERROR_FILE:
            split_str = ','
            logger.warning("文件可能被修改过, 跳过 %s" % file)
            continue
        else:
            split_str = '\t'
        file_path = os.path.join(path, file)
        logger.info('处理 %s' % file_path)
        f, code = read_file(file_path)
        try:
            h = f.readline()
            head = h.strip('\n').split(split_str)
            logger.debug("表头长度: %s, %s" % (len(head), head))
        except UnicodeDecodeError:
            # Header could not be decoded: reopen with a detected encoding.
            # NOTE(review): the first handle ``f`` is not closed before being replaced.
            f, code = read_file(file_path, error=True)
            h = f.readline()
            head = h.strip('\n').split(split_str)
            logger.debug("表头长度: %s, %s" % (len(head), head))
        # An empty header cell marks the end of the real columns.
        # NOTE(review): this overwrites data_column_count for all following files too.
        if '' in head:
            data_column_count = head.index('')
        if len(head) > data_column_count:
            head = head[:data_column_count]
        # print(head)
        while True:
            # NOTE(review): if the very first readline() raises, ``line`` is not bound yet
            # and the except branch fails; otherwise it logs the previous line.
            try:
                line = f.readline()
            except UnicodeDecodeError:
                logger.info('错误行: %s' % line)
                continue
            # An empty string means end of file.
            if not line:
                break
            data = line.strip('\n').split(split_str)
            # Pair header names with the leading cells of the row.
            documents.append(
                dict(zip(head, data[:data_column_count]))
            )
            document_count += 1
            # Flush a shard every 100,000 rows and start collecting again.
            if document_count >= 1e5:
                shard = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\After", '%s.csv' % file_seq)
                logger.info("数据条数到达 %s 保存一个文件: %s" % (document_count, shard))
                big_table = pd.DataFrame(documents)
                logger.info("配置 : %s %s" % big_table.shape)
                big_table.to_csv(shard, sep='\t', index=False)
                file_seq += 1
                documents = []
                document_count = 0
        f.close()
    # Write the remaining rows as the last shard (the logged count is the constant 1e5,
    # not the real number of rows).
    shard = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\After", '%s.csv' % file_seq)
    logger.info("数据条数到达 %s 保存最后一片: %s" % (1e5, shard))
    big_table = pd.DataFrame(documents)
    big_table.to_csv(shard, sep='\t', index=False)
    logger.info("文件编码表: %s" % decode_table)
 def merge_error_file(path, files: list):
    """Merge the comma-separated *files* found under *path* and re-export them
    as tab-separated shards of 100,000 rows, numbered from 101."""
    merged = pd.DataFrame()
    for name in files:
        part = pd.read_csv(os.path.join(path, name), low_memory=False,
                           encoding_errors='ignore', on_bad_lines='skip')
        print(part.shape)
        merged = pd.concat([merged, part])
    shard_size = 100000
    total_rows, _ = merged.shape
    for shard_no, begin in enumerate(range(0, total_rows, shard_size), start=101):
        shard = merged[begin: begin + shard_size]
        target = os.path.join('Y:\\zhaoxiangpeng\\2024BCR\\After', '%s.csv' % shard_no)
        shard.to_csv(target, index=False, sep='\t')
 def merge_standard_file(path):
    """Merge every tab-separated file in *path* and re-export the result as
    tab-separated shards of 1,000,000 rows, numbered from 1."""
    merged = pd.DataFrame()
    for name in os.listdir(path):
        part = pd.read_csv(os.path.join(path, name), sep='\t', low_memory=False)
        merged = pd.concat([merged, part])
    total_rows, _ = merged.shape
    shard_size = 1000000
    for shard_no, begin in enumerate(range(0, total_rows, shard_size), start=1):
        shard = merged[begin: begin + shard_size]
        target = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\MergeFile", '%s.csv' % shard_no)
        shard.to_csv(target, index=False, sep='\t')
 def merge_small_file(path):
    """Merge every CSV file in *path* and export the result as Excel workbooks
    of 800,000 rows each (``1.xlsx``, ``2.xlsx``, ...).

    Undecodable bytes are ignored and malformed lines are skipped while reading.
    """
    # NOTE(review): the script entry point passes this same folder as *path*, so a
    # second run would also try to read the exported workbooks as CSV — confirm.
    # Raw string: the old literal relied on invalid escapes such as ``\B`` (same value,
    # but a syntax warning on current Python).
    export_dir = r"Y:\BCR\BCR202412\BCR2024书目补采API"
    tables = [
        pd.read_csv(os.path.join(path, file), index_col=False, low_memory=False,
                    encoding_errors='ignore', on_bad_lines='skip')
        for file in os.listdir(path)
    ]
    # Concatenate once instead of re-copying the growing table for every file.
    big_table = pd.concat(tables) if tables else pd.DataFrame()
    row, col = big_table.shape
    split = 800000
    for file_idx, x in enumerate(range(0, row, split), start=1):
        table = big_table[x: x + split]
        table.to_excel(os.path.join(export_dir, '%s.xlsx' % file_idx), index=False)
 def find_eid_by_regex(text):
    """Return the first ``2-s2.0-<digits>`` identifier found in *text*, or ``None`` if there is none."""
    match = re.search(r'2-s2\.0-\d+', text)
    return match.group(0) if match else None
 def batch_match(path, output_file='Y:\\zhaoxiangpeng\\BCR\\2025BCR\\eid.csv'):
    """Extract one EID per line from every file in *path* and append them to *output_file*.

    :param path: directory whose files (UTF-8 text) are scanned line by line.
    :param output_file: CSV collecting the EIDs; defaults to the previously
        hard-coded location.  The ``EID`` header is written only when the file
        is new or empty, so repeated runs no longer insert extra header rows
        between the data.

    Lines without an EID are printed; a summary of scanned and matched lines
    is printed at the end.
    """
    count = 0
    line_count = 0
    need_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    # ``with`` closes the writer even when reading one of the inputs fails.
    with open(output_file, 'a+', encoding='utf-8') as writer:
        if need_header:
            writer.write('EID'+'\n')
        for fname in os.listdir(path):
            file = os.path.join(path, fname)
            with open(file, encoding='utf-8') as f:
                for line in f:
                    line_count += 1
                    eid = find_eid_by_regex(line)
                    if not eid:
                        print(line)
                    else:
                        count += 1
                        writer.write(eid + '\n')
    print('总行数：%s\n匹配到：%s' % (line_count, count))
 def func11():
    """Write to ``eid2.csv`` the EIDs listed in ``eid.csv`` that are missing
    from the workbooks under ``MergeFile``."""
    path = 'Y:\\zhaoxiangpeng\\BCR\\2025BCR'
    path2 = os.path.join(path, 'MergeFile')
    # Collect the EID column of every merged workbook.
    collected = pd.DataFrame()
    for name in os.listdir(path2):
        sheet = pd.read_excel(os.path.join(path2, name), engine='openpyxl', sheet_name=0)
        eid_only = sheet[['EID']]
        print(eid_only.shape)
        collected = pd.concat([collected, eid_only])
    collected.drop_duplicates(subset=['EID'], inplace=True)
    # EIDs to look for, renamed so both key columns survive the join.
    wanted = pd.read_csv(os.path.join(path, 'eid.csv'))
    wanted.drop_duplicates(subset=['EID'], inplace=True)
    wanted.rename(columns={'EID': "EID2"}, inplace=True)
    joined = pd.merge(wanted, collected, how='left', left_on=['EID2'], right_on=['EID'])
    print(joined)
    # Rows without a partner on the right are the missing EIDs.
    missing = joined[joined['EID'].isna()]
    missing['EID2'].to_csv(os.path.join(path, 'eid2.csv'), index=False)
 if __name__ == '__main__':
    # merge_files_by_row("Y:\\zhaoxiangpeng\\2024BCR\\API采集数据")
    # merge_error_file("Y:\\zhaoxiangpeng\\2024BCR\\API采集数据",
    #                  files=['ALL-CM.CSV', 'ALL-HDX.CSV', '失败记录第二次重采-20231228 15时53分下载.csv'])
    # merge_standard_file('Y:\\zhaoxiangpeng\\2024BCR\\After')
    merge_small_file(r'Y:\BCR\BCR202412\BCR2024书目补采API')
    # batch_match('Y:\\zhaoxiangpeng\\BCR\\2025BCR\\API采集')
    # func11()
--- a/bnu_wos/20250108_func.py
+++ b/bnu_wos/20250108_func.py
--- a/bnu_wos/init.py
+++ b/bnu_wos/init.py
@ -0,0 +1,4 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2025/1/8 14:28
 # @Author  : zhaoxiangpeng
 # @File    : __init__.py.py
--- a/data_process_tool/init.py
+++ b/data_process_tool/init.py
@ -0,0 +1,17 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/8/5 8:58
 # @Author  : ZAOXG
 # @File    : __init__.py
 from .file_read import read_data
 from .file_write import write_data
 from .merge_table import merge_table, get_dirs_file
 from .change_head import rename_head
 __all__ = [
    'read_data',
    'write_data',
    'merge_table',
    'get_dirs_file',
    'rename_head'
 ]
--- a/data_process_tool/change_head.py
+++ b/data_process_tool/change_head.py
@ -0,0 +1,16 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2023/5/24 15:23
 # @Author  : zhaoxiangpeng
 # @File    : change_head.py
 import pandas as pd
 from typing import Union
 def rename_head(table_or_head: Union[pd.DataFrame, list], postfix: str = '-other') -> dict:
    """Build a ``{old_name: str(old_name) + postfix}`` mapping for ``DataFrame.rename``.

    *table_or_head* is either a DataFrame (its column names are used) or an
    iterable of column names.
    """
    heads = table_or_head
    if isinstance(heads, pd.DataFrame):
        heads = heads.columns.values.tolist()
    return {head: str(head) + postfix for head in heads}
--- a/data_process_tool/condition_filter.py
+++ b/data_process_tool/condition_filter.py
@ -0,0 +1,68 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/9/20 9:05
 # @Author  : ZAOXG
 # @File    : condition_filter.py
 import data_process_tool
 import pandas as pd
 def get_filter(text):
    """Unfinished helper: splits *text* on ``'; '`` and visits each piece."""
    text_list = text.split('; ')
    for t in text_list:
        # NOTE(review): ``str.index`` needs a substring argument, so this call raises
        # TypeError as written — confirm what was meant to be looked up.
        t.index()
 def method1(table):
    """Split a WOS export into the two campuses of China University of Petroleum.

    The address column ``C1`` is split on ``'; '`` into one row per address.
    A record belongs to a campus when at least one of its addresses matches
    both the university pattern and one of the campus cities.  Matching
    records are written next to the input as ``...-华东.xlsx`` and
    ``...-北京.xlsx``.

    :param table: path of the workbook to process.
        NOTE(review): a DataFrame is accepted by the first branch, but the
        output names are derived from the path, so only a path works end to end.
    """
    table_name = None
    if isinstance(table, str):
        table_name = str(table)
        table = data_process_tool.read_data(table)
    # Keep only the unique id (UT); the addresses (C1) are exploded below.
    # ``rename`` without inplace avoids writing into a slice of ``table``.
    new_table = table[['UT', 'C1']].rename(columns={'UT': 'UT-BP'})
    locations = table['C1'].str.split('; ', expand=True)
    locations = locations.stack()
    locations = locations.reset_index(level=1, drop=True)
    locations.name = 'LOCATION'
    new_table = new_table.drop(['C1'], axis=1).join(locations)
    new_table['LOCATION'] = new_table['LOCATION'].str.lower()
    print(new_table)
    # University pattern taken from the WOS search strategy.
    # na=False: records without an address give NaN here, which cannot be used in a
    # boolean mask; they simply do not match.
    is_university = new_table['LOCATION'].str.contains(
        'univ petr|petr univ|unit petr|univj petr', na=False)

    # East China campus.
    in_east = new_table['LOCATION'].str.contains('shandong|qingdao|dongying', na=False)
    hd = new_table.loc[is_university & in_east, ['UT-BP']].drop_duplicates(subset=['UT-BP'])
    hd = pd.merge(table, hd, 'left', left_on=['UT'], right_on=['UT-BP'])
    hd = hd[hd['UT-BP'].notnull()].drop(columns=['UT-BP'])
    hd.to_excel(table_name.replace('.xlsx', '-华东.xlsx'), index=False)

    # Beijing campus.
    in_beijing = new_table['LOCATION'].str.contains('beijing', na=False)
    bj = new_table.loc[is_university & in_beijing, ['UT-BP']].drop_duplicates(subset=['UT-BP'])
    bj = pd.merge(table, bj, 'left', left_on=['UT'], right_on=['UT-BP'])
    bj = bj[bj['UT-BP'].notnull()].drop(columns=['UT-BP'])
    bj.to_excel(table_name.replace('.xlsx', '-北京.xlsx'), index=False)
 def condition_filter(table, filters: list = None):
    """Split a WOS export by campus city found anywhere in the address column ``C1``.

    Rows mentioning shandong, qingdao or dongying go to ``...华东.xlsx``, rows
    mentioning beijing go to ``...北京.xlsx``, both next to the input file.

    :param table: path of the workbook to process.
        NOTE(review): a DataFrame is accepted by the first branch, but the
        output names are derived from the path, so only a path works end to end.
    :param filters: not used.
    """
    table_name = None
    if isinstance(table, str):
        table_name = str(table)
        table = data_process_tool.read_data(table)
    # Lower-cased addresses are kept in a separate Series, so no helper column is
    # added to (and left behind in) the caller's table.
    address = table['C1'].str.lower()
    # na=False: rows without an address do not match instead of breaking the mask.
    hd = table[address.str.contains('shandong|qingdao|dongying', na=False)]
    hd.to_excel(table_name.replace('.xlsx', '华东.xlsx'), index=False)
    bj = table[address.str.contains('beijing', na=False)]
    bj.to_excel(table_name.replace('.xlsx', '北京.xlsx'), index=False)
 if __name__ == '__main__':
    # condition_filter(r'F:\工作数据存储2022\20220919_中国石油大学wos\2020.xlsx')
    method1(r'Z:\客户数据存储\WOS\中国石油大学\OG=(China University of Petroleum) AND (PY==(2022))\OG=(China University of Petroleum) AND (PY==(2022)).xlsx')
--- a/data_process_tool/example.py
+++ b/data_process_tool/example.py
@ -0,0 +1,14 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/9/21 15:12
 # @Author  : ZAOXG
 # @File    : example.py
 import re
 import pandas as pd
 from data_process_tool import read_data
 def format_conversion(file_name: str):
    """Unfinished helper: loads *file_name* and is meant to strip ``<tag>...</tag>`` spans."""
    file_df = read_data(file_name)
    # file_df['wss'].str
    # NOTE(review): ``re.sub`` is called without the string to process, so this raises
    # TypeError as written — confirm which column should be cleaned.
    re.sub(r'<[A-Za-z]+>.*?</[A-Za-z]+>', '', )
--- a/data_process_tool/file_read.py
+++ b/data_process_tool/file_read.py
@ -0,0 +1,43 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/8/5 8:58
 # @Author  : ZAOXG
 # @File    : file_read.py
 import chardet
 import pandas as pd
 import warnings
 __all__ = [
    'read_data'
 ]
 # Reader used for each (lower-cased) file extension.
 file_type_operation = {
    'csv': pd.read_csv,
    'xlsx': pd.read_excel,
    'xls': pd.read_excel,
    'txt': pd.read_table,
    'xls2': pd.read_html
 }
 def read_data(file: str, **kwargs) -> pd.DataFrame:
    """Read *file* with the pandas reader that matches its extension.

    :param file: path of the file; a name without extension is read as ``txt``.
    :param kwargs: passed through to the pandas reader.
    :raises KeyError: when the extension has no registered reader.
    :raises UnicodeDecodeError: when decoding fails and encoding detection
        cannot offer a different encoding to retry with.

    On a decode error the raw bytes are inspected with ``chardet`` and the
    read is retried with the detected encoding.
    """
    import os

    # Look at the last path component only, so a dot in a directory name is not
    # mistaken for the start of an extension.
    suffix = os.path.splitext(file)[1]
    file_type = suffix[1:].lower() if suffix else 'txt'
    try:
        temp: pd.DataFrame = file_type_operation[file_type](file, **kwargs)
    except UnicodeDecodeError:
        warnings.warn('%s 编码异常，启用检查' % file)
        with open(file,  'rb') as f:
            data = f.read()
        encoding = chardet.detect(data)['encoding']
        # Retrying with the encoding that just failed (or with none detected) would
        # recurse forever; give up with the original error instead.
        if encoding is None or kwargs.get('encoding') == encoding:
            raise
        kwargs.update(encoding=encoding)
        warnings.warn('%s 尝试使用 "%s" 解码' % (file, encoding))
        temp = read_data(file, **kwargs)
    return temp
--- a/data_process_tool/file_write.py
+++ b/data_process_tool/file_write.py
@ -0,0 +1,23 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/11/1 14:26
 # @Author  : ZAOXG
 # @File    : file_write.py
 import pandas as pd
 __all__ = [
    'write_data'
 ]
 # DataFrame writer method used for each (lower-cased) file extension.
 file_type_operation = {
    'csv': 'to_csv',
    'xlsx': 'to_excel',
    'xls': 'to_excel'
 }
 def write_data(data, file, index=True, **kwargs) -> None:
    """Write *data* to *file* with the DataFrame writer that matches the extension.

    :param data: object providing ``to_csv`` / ``to_excel`` (a DataFrame).
    :param file: target path; the extension is matched case-insensitively,
        as ``read_data`` does.
    :param index: forwarded to the writer.
    :param kwargs: forwarded to the writer.
    :raises KeyError: when the extension has no registered writer.
    :return: whatever the pandas writer returns (``None`` when writing to a path).
    """
    file_type = file.rsplit('.')[-1].lower()
    writer = getattr(data, file_type_operation[file_type])
    return writer(file, index=index, **kwargs)
--- a/data_process_tool/get_lose_year.py
+++ b/data_process_tool/get_lose_year.py
@ -0,0 +1,48 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2024/3/19 15:18
 # @Author  : zhaoxiangpeng
 # @File    : get_lose_year.py
 import os
 import re
 from loguru import logger
 # Raw string: the plain literal relied on the invalid escapes ``\w`` and ``\i``.
 ROOT_PATH = r"Y:\wos-metadata\issn-data"
 # ISSN folder -> sorted list of years missing between its first and last year.
 collection = dict()
 # ISSN folders that contain at least one year sub-folder.
 has_year_collection = set()
 # Every ISSN folder under the root path.
 master_dirs = os.listdir(ROOT_PATH)
 for master_dir in master_dirs:
    logger.debug('检测路径: %s' % master_dir)
    # An ISSN folder is expected to hold numerically named files or folders named IS=(xx).
    issn_file_path = os.path.join(ROOT_PATH, master_dir)
    if not os.path.isdir(issn_file_path):
        # A stray file directly under the root cannot be listed; skip it.
        continue
    single_year_list = []
    for child_dir_name in os.listdir(issn_file_path):
        child_dir = os.path.join(issn_file_path, child_dir_name)
        # Only folders carry year information.
        if not os.path.isdir(child_dir):
            continue
        # Year written as =(YYYY) or ==(YYYY) in the folder name.
        results = re.findall(r'={1,2}\((\d{4})\)', child_dir_name)
        if results:
            single_year_list.append(int(results[0]))
            has_year_collection.add(master_dir)
    if not single_year_list:
        continue
    # Every year between the earliest and the latest one found should be present.
    must_year_list = range(min(single_year_list), max(single_year_list) + 1)
    # Compare as sets: comparing list lengths misses a gap when a year occurs twice.
    lose_year_list = sorted(set(must_year_list) - set(single_year_list))
    if lose_year_list:
        logger.warning("%s 有年份缺失, 缺失的年份有: %s" % (master_dir, lose_year_list))
        collection[master_dir] = lose_year_list
 logger.warning('有年份的文件夹: %s' % has_year_collection)
 logger.warning('缺失年份的文件夹: %s' % collection)
--- a/data_process_tool/merge_table.py
+++ b/data_process_tool/merge_table.py
--- a/doi_parse/init.py
+++ b/doi_parse/init.py
@ -0,0 +1,4 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/5/31 14:38
 # @Author  : ZhaoXiangPeng
 # @File    : __init__.py
--- a/doi_parse/getkeys.py
+++ b/doi_parse/getkeys.py
@ -0,0 +1,48 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/6/2 14:02
 # @Author  : ZhaoXiangPeng
 # @File    : getkeys.py
 from ReSpider.db.redisdb import RedisDB
 import pandas as pd
 import json
 from typing import List, Dict
 class GetCount:
    """Dump the hash stored under every Redis key matching ``*-*`` to one CSV file per key."""
    def __init__(self, db: int = 1, write_path: str = None):
        self.client = RedisDB(db=db)
        # Output directory prefix; it is concatenated directly with the file name.
        self.write_path = write_path if write_path else 'E:/inspec/'
    def get_keys(self) -> List[bytes]:
        """Return the Redis keys that match ``*-*``."""
        return self.client.keys(pattern='*-*')
    def get_kv(self, key) -> dict:
        """Return all fields and values of the hash stored at *key*."""
        return self.client.hgetall(key)
    @staticmethod
    def format(records: dict, key: str) -> list:
        """Turn a ``{doi_bytes: count_bytes}`` mapping into a list of row dicts tagged with *key*."""
        return [
            {'doi': field.decode('utf-8'), 'count': int(value.decode('utf-8')), 'issn': key}
            for field, value in records.items()
        ]
    def to_csv(self, data: list = None, file_name: str = None):
        """Write *data* to ``<write_path><file_name>.csv`` without the index."""
        target = self.write_path+file_name+'.csv'
        pd.DataFrame(data).to_csv(target, index=False)
    def aa(self):
        """Export every matching key to its own CSV file."""
        for raw_key in self.get_keys():
            key_string: str = raw_key.decode('utf-8')
            kvs: dict = self.get_kv(key_string)
            print('*'*5 + key_string + '*'*5)
            rows = self.format(kvs, key_string)
            self.to_csv(data=rows, file_name=key_string)
 if __name__ == '__main__':
    gc = GetCount(db=2)
    # gc.get_keys()
    gc.aa()
--- a/doi_parse/gz.py
+++ b/doi_parse/gz.py
@ -0,0 +1,52 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/5/31 8:52
 # @Author  : ZhaoXiangPeng
 # @File    : gz.py
 import gzip
 import json
 import ujson
 def data_parse(io):
    """Print a summary of every item in a Crossref-style JSON dump read from *io*.

    For items with a non-zero ``reference-count`` the references are walked:
    a reference is identified by its DOI or, failing that, by its article
    title (which is also printed).
    """
    for item in json.load(io)['items']:
        titles = item.get('title')
        sources = item.get('container-title')
        # First entry of each list, or the falsy value itself when it is missing/empty.
        title = titles and titles[0]
        source = sources and sources[0]
        doi, dtype, issn = item.get('DOI'), item.get('type'), item.get('ISSN')
        print('****************************************************\n'
              'TITLE:    %s\n'
              'DOI:      %s\n'
              'TYPE:     %s\n'
              'ISSN:     %s\n'
              'SOURCE:   %s\n' % (title, doi, dtype, issn, source))
        if not item.get('reference-count', 0):
            continue
        try:
            reference_list = []
            for reference in item.get('reference', []):
                ref_doi = reference.get('DOI')
                ref_at = None if ref_doi else reference.get('article-title')
                if ref_doi:
                    reference_list.append(ref_doi)
                elif ref_at:
                    print(ref_at)
                    reference_list.append(ref_at)
                else:
                    # Neither DOI nor title: only the journal title is looked up.
                    ref_jt = reference.get('journal-title')
        except KeyError:
            print(item.keys())
 def un_gz(file_name):
    """Open *file_name* as a gzip stream and return the ``gzip.GzipFile`` object."""
    return gzip.GzipFile(file_name)
 if __name__ == '__main__':
    un_gz('H:/crossref_public_data_file_2021_01/1.json.gz')
--- a/doi_parse/inspec2mongo.py
+++ b/doi_parse/inspec2mongo.py
@ -0,0 +1,159 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/6/8 16:54
 # @Author  : ZhaoXiangPeng
 # @File    : inspec2mongo.py
 from ReSpider.db.mongodb import AsyncMongoDB
 from ReSpider.extend.logger import LogMixin
 from doi_parse.inspec2redis import InspecToRedis
 import asyncio
 import os
 import json
 import time
 import logging
 # DOIs of interest. InspecToMongo.data_parse stores a {doi, ref_doi} pair
 # whenever either the citing item's DOI or the referenced DOI is in this set.
 DOI_LIST = {'10.19381/j.issn.1001-7585.2020.21.008', '10.14111/j.cnki.zgfx.2019.06.004', '10.13530/j.cnki.jlis.190047'}
 class AsyncBase(LogMixin):
    """Drain a task list through an async handler with bounded concurrency.

    Tasks are popped from the end of ``task_list`` and passed to the coroutine
    function registered with :meth:`add_callback`. A semaphore sized ``limit``
    is acquired before each task is scheduled and is handed to the handler as
    its second argument; this class never releases it, so the handler must.
    """
    def __init__(self, task_list: list, limit: int = 6, loop=None):
        """
        :param task_list: list of pending tasks; it is consumed in place.
        :param limit: maximum number of handlers in flight at once.
        :param loop: event loop to run on; defaults to ``asyncio.get_event_loop()``.
        """
        super().__init__()
        self.TASK_LIST: list = task_list
        self.loop = loop or asyncio.get_event_loop()
        self.limit = limit
    def add_callback(self, func):
        """
        Register the task handler.

        ``func`` is called as ``func(task, semaphore)`` and the coroutine it
        returns is scheduled with ``loop.create_task``.
        """
        self.call_func = func
    async def next_task(self):
        """Schedule handlers until the task list is empty and they have finished.

        :raises RuntimeError: if no handler was registered beforehand.
        """
        # Fail before consuming any task; otherwise a task would be popped and
        # lost on the AttributeError raised by the missing attribute.
        if getattr(self, 'call_func', None) is None:
            raise RuntimeError('no task handler registered; call add_callback() first')
        semaphore = asyncio.Semaphore(value=self.limit)
        while True:
            try:
                task = self.TASK_LIST.pop()
            except IndexError:
                # Nothing left to hand out: poll every 3 seconds until at most
                # one unfinished task remains on the loop (normally the task
                # running this coroutine itself).
                await asyncio.sleep(3)
                if len(asyncio.all_tasks(loop=self.loop)) <= 1:
                    break
                continue
            # Blocks while `limit` handlers are still holding the semaphore.
            await semaphore.acquire()
            self.loop.create_task(
                self.call_func(task, semaphore))
    def execute(self):
        """Run :meth:`next_task` to completion on ``self.loop`` (blocking).

        Exceptions from the run are logged, not re-raised.
        """
        self.logger.info('TASK INIT SUCCESS.')
        try:
            self.loop.run_until_complete(
                self.next_task()
            )
        except Exception as e:
            self.logger.error('execute %s' % e, exc_info=True)
        finally:
            self.loop.run_until_complete(
                self.loop.shutdown_asyncgens())
            # No loop.stop() here: the loop is no longer running, and calling
            # stop() on an idle loop only arms its stop flag, which makes the
            # *next* run_until_complete() on this loop abort after one
            # iteration ("Event loop stopped before Future completed").
            self.logger.info('THE END.')
 class InspecToMongo(InspecToRedis):
    """Scan dump files and store DOI -> referenced-DOI pairs through ``db``.

    A pair is stored when either the citing item's DOI or the referenced DOI
    is contained in ``DOI_LIST``. Files are processed concurrently through
    ``AsyncBase`` (see :meth:`start`).
    """
    def __init__(self, db=None, file_list=None, root: str = None):
        """
        :param db: async database client; ``add_batch(collection, rows)`` is awaited on it.
        :param file_list: names of the files to process, relative to ``root``.
        :param root: directory holding the files; the parent class supplies the default.
        """
        # The parent stores file_list/root and prints the task count, so that
        # is not repeated here (it used to be printed twice).
        super().__init__(file_list=file_list, root=root)
        self.db: AsyncMongoDB = db
    async def process(self, file=None, semaphore=None):
        """Handle one file; used as the ``AsyncBase`` task handler.

        :param file: file name to process; when falsy the next queued file is taken.
        :param semaphore: concurrency slot acquired by the scheduler for this
            task. It is released exactly once when this coroutine ends,
            whether it returns early, fails, or completes.
        """
        try:
            if not file and self.file_list:
                file = self.get_next()
            if not file:
                return
            print('******************** 当前处理文件 %s ********************' % file)
            io = self.compatible(file)
            # The semaphore is deliberately not forwarded: it is released in
            # the `finally` below so every exit path gives the slot back.
            await self.data_parse(io)
        finally:
            if semaphore is not None:
                semaphore.release()
    async def data_parse(self, io, semaphore=None):
        """Parse one opened dump file and store the matching citation pairs.

        :param io: readable file-like object with a JSON document that has a
            top-level ``items`` list; it is closed once it has been read.
        :param semaphore: optional slot to release when parsing ends
            (successfully or not); ``None`` means nothing is released.
        :raises KeyError: if the document has no ``items`` key.
        """
        try:
            try:
                data = json.load(io)
            finally:
                # Close the handle even when the JSON is malformed.
                io.close()
            for item in data['items']:
                doi = item.get('DOI')
                if not item.get('reference-count', 0):
                    continue
                # If the citing DOI itself is wanted, every reference that has
                # a DOI goes into the relation table.
                doi_tag = doi in DOI_LIST
                reference_list = []
                for reference in item.get('reference', []):
                    ref_doi = reference.get('DOI')
                    if not ref_doi:
                        # References without a DOI are skipped.
                        continue
                    if doi_tag or ref_doi in DOI_LIST:
                        doi_in = {'doi': doi, 'ref_doi': ref_doi}
                        print(doi_in)
                        reference_list.append(doi_in)
                if reference_list:
                    await self.db.add_batch('data_crossref_doirelation', reference_list)
        finally:
            if semaphore is not None:
                semaphore.release()  # give the concurrency slot back
    def start(self):
        """Process every queued file with at most 6 concurrent handlers (blocking)."""
        aio_task = AsyncBase(self.file_list, limit=6)
        aio_task.add_callback(self.process)
        aio_task.execute()
 if __name__ == '__main__':
    # Script entry point: process the dump files from index 20000 onwards and
    # report the elapsed wall-clock time.
    mongo = AsyncMongoDB(host='127.0.0.1', port=27017, db='data_crossref')
    # For a quick local test use a single file instead, e.g. ['0.json'].
    pending = InspecToRedis.load_gz('H:/crossref_public_data_file_2021_01')[20000:]
    started = time.time()
    runner = InspecToMongo(db=mongo, file_list=pending)
    runner.start()
    elapsed = time.time() - started
    print('耗时 %s 秒' % elapsed)
--- a/doi_parse/inspec2redis.py
+++ b/doi_parse/inspec2redis.py
@ -0,0 +1,174 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/5/31 10:02
 # @Author  : ZhaoXiangPeng
 # @File    : inspec2redis.py
 from ReSpider.db.redisdb import RedisDB
 from doi_parse.gz import un_gz
 from queue import Queue
 import pandas as pd
 import os
 import time
 import json
 from concurrent.futures import ThreadPoolExecutor
 class Counter:
    """Small wrapper around a Redis client for hash-based counters.

    The client must offer ``hincrby``, ``hget`` and ``sismember`` and expose
    the raw connection as ``_redis`` (used to open a pipeline in :meth:`incr`).
    """
    def __init__(self, clint=None):
        """
        :param clint: client to use (parameter name kept as-is for callers);
            when ``None`` a ``RedisDB(db=2)`` is created.
        """
        self.client: RedisDB = clint
        if clint is None:
            self.client = RedisDB(db=2)
    @staticmethod
    def _as_list(value):
        """Turn ``None`` into ``[]``, a bare string into ``[value]``, anything else into a list."""
        if value is None:
            return []
        if isinstance(value, str):
            # A lone string must not be iterated character by character.
            return [value]
        return list(value)
    def incr(self, key: list, counter: list):
        """
        Increment, for every known journal key, the counter of every article id.

        key: values usable as the journal's unique id / redis key (e.g. ISSNs);
             ``None`` or a single string are accepted as well.
        counter: values usable as an article's unique id (doi or title);
             ``None`` or a single string are accepted as well.

        Keys are upper-cased and skipped unless they are members of the
        ``inspec:journals`` set. All increments are sent in one pipeline.
        """
        keys = self._as_list(key)
        fields = self._as_list(counter)
        if not keys or not fields:
            # Items without ISSN or without usable references: nothing to do.
            return
        pipe = self.client._redis.pipeline()
        pipe.multi()
        for k in keys:
            if not k:
                continue
            k = k.upper()
            if not self.sismember('inspec:journals', k):
                continue
            for value in fields:
                pipe.hincrby(k, value)
        pipe.execute()
    def decr(self, key: str, counter: str, amount=1):
        """Decrease the counter by ``amount`` and return the client's result."""
        return self.client.hincrby(key, counter, amount=-amount)
    def get_cnt(self, key: str, counter: str):
        """Return the current value of the counter."""
        return self.client.hget(key, counter)
    def sismember(self, key, value):
        """Return the client's answer to whether ``value`` is in the set ``key``."""
        return self.client.sismember(key, value)
 class InspecToRedis:
    """
    Walk dump files and count, per journal ISSN, how often each DOI is cited.

    1. load the file list
    2. decompress
    3. parse
    4. store
    4.1 associate by ISSN
    4.2 resulting layout:
        ISSN1
            doi1 1
            doi2 1
        ISSN2
            doi1 2
            doi2 1
    """
    def __init__(self, counter=None, file_list=None, root: str = None):
        """
        :param counter: object whose ``incr(issn_list, doi_list)`` records the counts.
        :param file_list: file names relative to ``root``; consumed in place.
            ``None`` is treated as an empty list.
        :param root: directory holding the files.
        """
        self.counter = counter  # counter used by data_parse
        # Keep the caller's list object (it is consumed via pop()), but do not
        # crash on the default of None.
        self.file_list: list = file_list if file_list is not None else []
        print('初始化任务 %s 个' % len(self.file_list))
        self.root = root or 'H:/crossref_public_data_file_2021_01/'
    @staticmethod
    def load_gz(file_path):
        """Return the names of the entries in directory ``file_path``."""
        return os.listdir(file_path)
    def inspec2redis(self):
        """Read the journal list CSV and add its ISSN column to ``inspec:journals``."""
        df = pd.read_csv('F:/工作数据存储2022/20220526_inspec测试/inspec期刊列表2.csv')
        issn_list = df['ISSN'].values.tolist()
        return self.counter.client.sadd('inspec:journals', issn_list)
    def get_next(self):
        """Remove and return the last queued file name (IndexError when empty)."""
        return self.file_list.pop()
    def compatible(self, file):
        """Open ``file`` under ``root``: gzip stream for ``*gz`` names, UTF-8 text otherwise."""
        # os.path.join instead of string concatenation so that a root without
        # a trailing separator still yields a valid path.
        path = os.path.join(self.root, file)
        if file.endswith('gz'):
            return un_gz(path)
        return open(path, encoding='utf-8')
    def data_parse(self, io):
        """Count the DOIs referenced by every item of one opened dump file.

        ``io`` must contain a JSON document with a top-level ``items`` list
        (KeyError otherwise). Items reporting no references, items without an
        ISSN and items whose references carry no DOI are skipped; for the rest
        ``counter.incr(issn, reference_dois)`` is called once per item.
        """
        data = json.load(io)
        for item in data['items']:
            if not item.get('reference-count', 0):
                continue
            issn = item.get('ISSN')
            # Only references identified by a DOI are counted; references that
            # merely have an article or journal title are ignored.
            reference_list = [
                reference.get('DOI')
                for reference in item.get('reference', [])
                if reference.get('DOI')
            ]
            if not issn or not reference_list:
                # No journal key to count under, or nothing to count.
                continue
            self.counter.incr(issn, reference_list)
    def pro(self, file=None):
        """Process ``file``, or the next queued file when ``file`` is falsy."""
        if not file and self.file_list:
            file = self.get_next()
        if not file:
            return
        print('******************** 当前处理文件 %s ********************' % file)
        io = self.compatible(file)
        try:
            self.data_parse(io)
        finally:
            # Close the handle even when parsing fails.
            io.close()
    def batch(self):
        """Process all queued files on a pool of two worker threads.

        NOTE: the results of ``executor.map`` are not consumed, so exceptions
        raised inside ``pro`` are not propagated to the caller.
        """
        with ThreadPoolExecutor(max_workers=2) as executor:
            executor.map(self.pro, self.file_list)
    def start(self):
        """Process the queued files one after another until the list is empty."""
        index_count = 0
        while self.file_list:
            index_count += 1
            print('当前处理第 %s 个' % index_count)
            self.pro()
 if __name__ == '__main__':
    # Script entry point: run the threaded batch over a single test file and
    # report the elapsed wall-clock time.
    # For the full run, list the dump directory with InspecToRedis.load_gz
    # and slice the result instead of using the one-file list below.
    redis_counter = Counter()
    todo = ['0.json']
    job = InspecToRedis(counter=redis_counter, file_list=todo)
    started = time.time()
    job.batch()
    # job.inspec2redis() would seed the 'inspec:journals' set instead.
    elapsed = time.time() - started
    print('耗时 %s 秒' % elapsed)
--- a/doi_parse/inspec数据库施引测试数据.csv
+++ b/doi_parse/inspec数据库施引测试数据.csv
@ -0,0 +1,443 @@
 doi,count,issn
 10.1002/cite.201800135,3.0,0009-286X; 2058-9883
 10.1002/cnm.3180,2.0,1617-7959; 2267-1242
 10.1002/eng2.12217,1.0,1474-0346
 10.1002/geot.201800058,2.0,1865-7362
 10.1002/geot.201900076,2.0,0886-7798; 1865-7362
 10.1002/inst.12189,2.0,2156-485X; 0937-7255
 10.1002/inst.12279,1.0,1098-1241
 10.1002/j.2334-5837.2019.00598.x,2.0,2334-5837
 10.1002/lpor.202000254,1.0,1996-1944
 10.1002/sys.21503,1.0,1539-7734
 10.1007/978-3-030-04849-5_31,1.0,2078-2489
 10.1007/978-3-030-23162-0_4,1.0,1742-6596
 10.1007/978-3-030-23703-5_2,1.0,0020-7543
 10.1007/978-3-030-27878-6_5,1.0,2504-4494
 10.1007/978-3-030-29513-4_72,1.0,2076-3417
 10.1007/978-3-030-29852-4_30,1.0,1424-8220
 10.1007/978-3-030-30985-5_4,1.0,0925-9856
 10.1007/978-3-030-34644-7_40,1.0,2220-9964
 10.1007/978-3-030-34986-8_41,2.0,2161-3915
 10.1007/978-3-030-38077-9_75,1.0,0042-3114
 10.1007/978-3-030-42416-9_18,1.0,2071-1050
 10.1007/978-3-319-59153-7_13,1.0,0941-0643
 10.1007/978-3-319-65298-6_1,3.0,0020-7543; 1064-1246; 1068-798X
 10.1007/978-981-15-0802-8_199,1.0,2076-3417
 10.1007/s00170-011-3370-y,4.0,2234-7593; 0268-3768; 0043-1648
 10.1007/s00170-013-5155-y,10.0,2076-3417; 2071-1050; 0377-2217; 0925-5273; 0951-192X; 0018-7208; 1387-3954; 0268-3768; 1996-1073; 1751-7575
 10.1007/s00170-014-6091-1,3.0,0268-3768; 0263-2241; 1424-8220
 10.1007/s00170-018-1617-6,43.0,0888-3270; 2041-1723; 1868-5137; 2288-6206; 0268-3768; 2075-1702; 2076-3417; 0141-9331; 0890-0604; 2071-1050; 2227-9717; 0360-3199; 1050-0472; 1022-0038; 0020-7543; 0013-7944; 1530-9827; 1742-6596; 1729-8814; 0951-192X; 0929-6212; 1747-7778; 1617-9846; 1751-7575
 10.1007/s00170-018-2001-2,11.0,2076-3417; 2504-4494; 2227-9717; 1530-8669; 0951-192X; 2288-6206; 0268-3768; 1068-798X
 10.1007/s00170-018-2748-5,12.0,0360-5442; 0161-0457; 2076-3417; 2504-4494; 0925-5273; 0951-192X; 1022-0038; 0950-7051; 0268-3768
 10.1007/s00170-019-03794-z,4.0,0951-192X; 0268-3768; 0020-7543; 2234-7593
 10.1007/s00170-019-04653-7,8.0,0161-0457; 2504-4494; 2288-6206; 0268-3768; 0020-7543
 10.1007/s00170-019-04706-x,3.0,1050-0472; 2076-3417; 0268-3768
 10.1007/s00170-020-05056-9,4.0,2071-1050; 1996-1073; 0268-3768
 10.1007/s00170-020-05387-7,1.0,0268-3768
 10.1007/s00170-020-05567-5,1.0,1059-9495
 10.1007/s00170-020-05977-5,3.0,0268-3768; 2076-3417
 10.1007/s00231-016-1961-8,9.0,0145-8892; 1556-3758; 0177-0667; 0947-7411; 0145-8876; 0921-8831
 10.1007/s10257-018-0376-0,1.0,2504-4494
 10.1007/s10586-016-0618-1,2.0,1386-7857
 10.1007/s10586-018-2041-2,2.0,1367-5567; 0020-7543
 10.1007/s10845-017-1350-2,28.0,0360-5442; 2076-3417; 1862-4472; 1568-4946; 2071-1050; 1092-7026; 2472-5854; 0268-3768; 0951-192X; 0944-6524; 1024-123X; 1063-293X; 1687-8140; 1741-038X; 0925-2312; 2571-5577; 0020-7543; 1064-1246
 10.1007/s10845-019-01500-0,1.0,2504-4494
 10.1007/s10845-019-01512-w,4.0,2071-1050; 1076-2787; 2076-3417; 1996-1073
 10.1007/s10845-019-01516-6,2.0,2076-3417; 1996-1073
 10.1007/s11831-018-9301-4,15.0,1134-3060; 2504-4494; 2213-7467; 1996-1944; 2523-3963; 0029-5981; 1960-6206; 1438-7492; 0268-3768; 1996-1073; 2311-5521
 10.1007/s11837-017-2709-8,7.0,2214-8604; 1047-4838; 1424-8220; 0964-1726
 10.1007/s11837-020-04028-4,3.0,2214-8604; 1047-4838; 0264-1275
 10.1007/s12008-012-0165-9,13.0,0965-9978; 1530-9827; 0954-4062; 1955-2513; 2213-8463; 0268-3768; 0020-7543
 10.1007/s12008-013-0201-4,4.0,2261-236X; 0723-2632; 1996-1944; 1955-2513
 10.1007/s12008-016-0319-2,11.0,2076-3417; 1742-6596; 1955-2513; 0951-192X; 1868-5137; 1741-038X; 0954-4054; 1751-7575; 0020-7543
 10.1007/s12008-019-00578-3,1.0,1955-2513
 10.1007/s12008-019-00621-3,1.0,2227-9717
 10.1007/s12008-020-00694-5,1.0,1350-4533
 10.1007/s12161-013-9634-4,8.0,0149-6085; 0022-5142; 1541-4337; 0145-8876; 1936-9751
 10.1007/s12517-019-4574-y,1.0,1070-9622
 10.1007/s12599-019-00624-0,2.0,2076-3417; 1350-1917
 10.1007/s12652-018-0881-5,31.0,0888-3270; 1077-5463; 1424-8220; 1868-5137; 0268-3768; 0926-5805; 0096-3003; 1562-2479; 2071-1050; 1367-5788; 2227-9717; 2095-8099; 0950-7051; 1687-8140; 0020-7543; 2504-4494; 0013-7944; 1742-6596; 1729-8814; 0951-192X; 2213-8463; 0929-6212
 10.1007/s12652-018-0911-3,13.0,2076-3417; 2071-1050; 1742-6596; 0263-2241; 0951-192X; 2095-8099; 1350-6307; 0926-5805; 0268-3768; 0954-4054
 10.1007/s12652-018-0944-7,3.0,1678-5878; 1742-6596; 0954-4054
 10.1007/s12652-018-0946-5,15.0,2076-3417; 0954-4054; 2504-4494; 1729-8814; 0951-192X; 2073-8994; 1868-5137; 1350-6307; 0268-3768; 1678-5878; 0020-7543
 10.1007/s12652-018-0953-6,9.0,2504-4494; 0263-2241; 1729-8814; 0951-192X; 1868-5137; 1741-0398
 10.1007/s12652-018-1125-4,7.0,2504-4494; 0268-3768; 1868-5137; 1847-9790
 10.1007/s13349-020-00403-6,2.0,2190-5452; 1424-8220
 10.1007/s40430-020-02461-9,1.0,0268-3768
 10.1007/s40684-020-00196-5,2.0,2234-7593; 2076-3417
 10.1007/s40684-020-00227-1,3.0,2234-7593; 0737-8831; 0020-7543
 10.1007/s40860-018-0069-y,2.0,0929-6212; 1022-0038
 10.1016/j.apm.2019.09.036,4.0,0263-2241; 2470-0045; 0888-3270; 0307-904X
 10.1016/j.arcontrol.2019.01.001,2.0,2213-8463; 0020-7543
 10.1016/j.autcon.2019.102837,11.0,1134-3060; 2071-1050; 1093-9687; 1475-9217; 0926-5805; 1687-8086; 1545-2255
 10.1016/j.autcon.2019.102915,4.0,0926-5805; 2076-3417; 1474-0346; 2631-4428
 10.1016/j.autcon.2019.102930,9.0,2076-3417; 0969-9988; 0268-4012; 2071-1050; 1471-4175; 1350-6307; 0926-5805; 0020-7543
 10.1016/j.autcon.2020.103179,12.0,2076-3417; 1618-954X; 2071-1050; 1424-8220; 1474-0346; 0926-5805; 1687-8086
 10.1016/j.autcon.2020.103183,2.0,2071-1050; 1687-8086
 10.1016/j.autcon.2020.103277,1.0,2076-3417
 10.1016/j.cad.2011.07.007,6.0,0965-9978; 0951-192X; 0944-6524; 1745-2759; 0094-2405
 10.1016/j.cherd.2012.08.004,28.0,0959-3330; 1932-2135; 0306-2619; 0145-8892; 1618-954X; 1385-8947; 1568-4946; 1556-7036; 1876-1070; 2077-0375; 0376-7388; 0360-3199; 0926-860X; 0167-7322; 0930-7516; 0022-1481; 0920-4105
 10.1016/j.cirp.2017.04.038,53.0,0954-4062; 0944-6524; 1868-5137; 0268-3768; 2075-1702; 2076-3417; 1478-0771; 2332-9017; 1474-0346; 0965-9978; 0161-0457; 0890-0604; 2071-1050; 1367-5788; 1087-1357; 1050-0472; 0935-1175; 2095-8099; 0020-7543; 0013-7944; 1530-9827; 0263-2241; 2267-1242; 0951-192X; 1751-7575
 10.1016/j.cirp.2017.04.040,80.0,1098-1241; 0378-7788; 1996-1944; 1424-8220; 1955-2513; 0029-5981; 0944-6524; 1868-5137; 2288-6206; 0268-3768; 2075-1702; 2076-3417; 1999-5903; 2332-9017; 2053-4701; 1474-0346; 2516-8398; 0926-5805; 2227-7080; 0735-3766; 1064-1246; 0268-4012; 0094-114X; 0890-0604; 2071-1050; 1367-5788; 1087-1357; 2227-9717; 1050-0472; 2095-8099; 1741-038X; 0010-4485; 0954-4054; 0020-7543; 1468-8115; 1530-9827; 1742-6596; 2523-3963; 0951-192X; 2234-7593; 0895-6308; 0040-5175; 2261-236X; 1751-7575
 10.1016/j.cirp.2018.04.039,3.0,0951-192X; 1474-0346; 1087-1357
 10.1016/j.cirp.2018.04.055,34.0,1098-1241; 1532-0626; 1868-5137; 2288-6206; 1863-8880; 0268-3768; 2076-3417; 0742-4795; 2332-9017; 1474-0346; 2071-1050; 1367-5788; 1087-1357; 0360-3199; 1050-0472; 2095-8099; 2391-5439; 0954-4054; 0020-7543; 2073-8994; 2504-4494; 0013-7944; 1742-6596; 1729-8814; 0951-192X
 10.1016/j.cirp.2018.04.118,2.0,2076-3417; 1063-293X
 10.1016/j.cirp.2019.04.011,6.0,0951-192X; 0161-0457; 2076-3417; 0954-4054
 10.1016/j.cirp.2019.04.024,5.0,2076-3417; 0954-4054; 2227-9717; 0020-7543
 10.1016/j.cirp.2019.04.041,1.0,1350-6307
 10.1016/j.cirp.2019.05.010,1.0,2071-1050
 10.1016/j.cirpj.2019.04.007,2.0,0360-5442; 1747-7778
 10.1016/j.cma.2020.112907,2.0,0010-4485; 0178-7675
 10.1016/j.cmpb.2019.01.003,2.0,1617-7959; 1025-5842
 10.1016/j.cobme.2018.04.001,5.0,2296-4185; 0935-9648; 1742-7061; 2054-5703
 10.1016/j.compag.2016.03.005,14.0,0950-5423; 0145-8892; 1866-7910; 2071-1050; 0038-7010; 0022-1155; 2072-4292; 0022-5142; 1537-5110; 2314-4920; 1936-9751
 10.1016/j.compchemeng.2018.09.022,3.0,1735-1472; 1064-1246; 1996-1073
 10.1016/j.compchemeng.2019.106577,1.0,0009-286X
 10.1016/j.compind.2019.103130,11.0,2076-3417; 2071-1050; 0263-2241; 1424-8220; 0268-3768; 0020-7543
 10.1016/j.compstruc.2020.106282,3.0,1424-8220
 10.1016/j.ejor.2018.04.032,6.0,2071-1050; 0377-2217; 1868-5137; 1747-7778; 0160-5682; 0920-8542
 10.1016/j.enbuild.2019.07.015,1.0,1940-1493
 10.1016/j.eng.2019.01.014,26.0,1134-3060; 2076-3417; 0009-286X; 1866-7910; 2504-4494; 1999-5903; 2071-1050; 2640-4567; 1424-8220; 2214-7853; 2227-9717; 2523-3963; 2095-8099; 1474-0346; 1868-5137; 2516-8398; 0737-3937; 0954-4054
 10.1016/j.engfracmech.2019.106673,2.0,2214-8604; 2193-9764
 10.1016/j.engfracmech.2019.106674,3.0,1996-1944; 2267-1242; 0307-904X
 10.1016/j.engfracmech.2019.106766,3.0,1099-4300; 1996-1944; 2571-631X
 10.1016/j.engfracmech.2020.107075,2.0,2071-1050; 0013-7944
 10.1016/j.ergon.2019.02.001,13.0,2076-3417; 1022-1360; 2071-1050; 0263-2241; 1424-8220; 2523-3963; 1955-2513; 1080-3548; 2169-3277; 0268-3768
 10.1016/j.fusengdes.2017.10.012,6.0,2332-9017; 0029-5515; 0268-3768; 0020-7543; 2571-631X
 10.1016/j.future.2019.12.020,6.0,2076-3417; 1424-8220; 0929-6212; 1741-0398; 2073-8994
 10.1016/j.ifacol.2015.06.141,77.0,1434-5021; 2578-0727; 1424-8220; 0944-6524; 1532-0626; 1868-5137; 1350-6307; 1076-2787; 0268-3768; 2076-3417; 1092-7026; 2332-9017; 1474-0346; 2227-7080; 1064-1246; 0306-2619; 0268-4012; 0890-0604; 2071-1050; 2156-485X; 1367-5788; 1087-1357; 2227-9717; 2296-4185; 2095-8099; 1099-4300; 1246-0125; 2524-8510; 0954-4054; 0020-7543; 2073-8994; 0177-0667; 0952-1976; 2220-9964; 2267-1242; 1729-8814; 2214-7853; 0951-192X; 2169-3277; 2213-8463; 1617-9846
 10.1016/j.ifacol.2016.11.115,30.0,1996-1944; 2363-7005; 1868-5137; 2288-6206; 0268-3768; 2332-9017; 1474-0346; 2071-1050; 2095-8099; 0010-485X; 1099-4300; 0954-4054; 0020-7543; 2073-8994; 0360-5442; 1742-6596; 2214-7853; 0951-192X; 1751-7575
 10.1016/j.ifacol.2017.08.2360,2.0,0268-3768
 10.1016/j.ifacol.2017.08.902,4.0,0951-192X; 2075-1702; 1042-9247; 0360-5442
 10.1016/j.ifacol.2018.03.104,3.0,2504-4494; 0268-3768; 1678-5878
 10.1016/j.ifacol.2018.06.356,4.0,0920-4105; 1687-8140
 10.1016/j.ifacol.2019.10.024,1.0,0957-0233
 10.1016/j.ifacol.2019.11.383,3.0,0953-7287; 0020-7543; 0268-4012
 10.1016/j.ifacol.2019.11.536,1.0,0953-7287
 10.1016/j.ifacol.2019.11.685,2.0,0178-2312
 10.1016/j.ijinfomgt.2019.05.020,6.0,0926-5805; 2640-4567; 0268-4012; 0360-3199
 10.1016/j.ijmst.2015.09.016,1.0,2071-1050
 10.1016/j.infrared.2015.06.002,5.0,2076-3417; 2071-1050; 0263-2241; 0268-3768; 0948-7921
 10.1016/j.infrared.2019.04.007,4.0,0703-8992; 2076-3417; 0145-8892; 1936-9751
 10.1016/j.isatra.2019.05.011,3.0,0268-3768; 0263-2241; 2227-9717
 10.1016/j.jclepro.2019.04.156,10.0,2412-3811; 2071-1050; 2076-3417; 1424-8220
 10.1016/j.jclepro.2019.119299,2.0,0360-5442; 0890-0604
 10.1016/j.jclepro.2019.119423,4.0,0951-192X; 2071-1050; 1741-0398; 0360-5442
 10.1016/j.jmapro.2013.10.004,10.0,1042-6914; 0043-1648; 1091-0344; 2214-7853; 1876-990X; 0268-3768; 0890-6955; 1678-5878
 10.1016/j.jmatprotec.2013.03.013,57.0,0884-2914; 1438-1656; 1996-1944; 2288-6206; 1350-6307; 0268-3768; 2159-6867; 0264-1275; 1464-4207; 2073-4360; 0925-5273; 2352-4928; 2227-7080; 0928-4931; 2214-8604; 1355-2546; 1359-6454; 1087-1357; 0167-6636; 1073-5623; 1073-5615; 0263-2241; 0734-743X; 0020-7403; 0921-5093; 2075-4701
 10.1016/j.jmsy.2018.02.002,21.0,2076-3417; 2224-2708; 2071-1050; 0263-2241; 1729-8814; 0885-8624; 0951-192X; 2213-8463; 2516-8398; 1741-038X; 0268-3768; 0954-4054; 1064-1246; 0307-904X; 0020-7543
 10.1016/j.jmsy.2018.05.003,25.0,2076-3417; 0890-0604; 1530-9827; 1424-8220; 0268-3768; 1386-7857; 0885-8624; 2234-7593; 2190-7188; 1868-5137; 1463-5771; 1741-038X; 1463-7154; 0020-7543
 10.1016/j.jmsy.2020.04.005,1.0,1996-1073
 10.1016/j.knosys.2014.03.010,4.0,0952-1976; 0950-7051
 10.1016/j.measurement.2011.09.018,2.0,0957-0233; 0263-2241
 10.1016/j.medengphy.2019.08.007,5.0,1350-4533; 2311-5521; 1364-503X; 2040-7939
 10.1016/j.mfglet.2018.02.006,28.0,0306-2619; 2050-7038; 2075-1702; 2079-9292; 0890-0604; 1093-9687; 1742-6596; 1424-8220; 2227-9717; 0951-192X; 0178-2312; 2213-8463; 1350-6307; 0926-5805; 0268-3768; 0007-6813; 0020-7543; 0307-904X
 10.1016/j.mfglet.2020.04.004,4.0,2214-8604; 1474-0346; 1996-1073
 10.1016/j.molliq.2016.12.028,11.0,1350-4177; 0041-624X; 0956-053X; 1872-5805; 1615-9306; 0167-7322; 2297-8747; 2398-4902
 10.1016/j.net.2020.03.028,1.0,1076-2787
 10.1016/j.optlastec.2010.07.010,5.0,0732-8818; 1568-4946; 0263-2241; 2523-3963; 2196-7229
 10.1016/j.procir.2016.11.152,68.0,1866-7910; 1574-017X; 1424-8220; 2363-7005; 0944-6524; 1868-5137; 0268-3768; 2075-1702; 2076-3417; 2332-9017; 0141-9331; 1474-0346; 2227-7080; 1996-1073; 0268-4012; 2071-1050; 1367-5788; 2288-5048; 2095-8099; 1687-8140; 1741-038X; 2095-0233; 2391-5439; 0954-4054; 0020-7543; 2224-2708; 0952-1976; 0013-7944; 1742-6596; 2504-4494; 0263-2241; 2214-7853; 0951-192X; 2169-3277; 2572-6668; 1747-7778; 1757-8981; 1687-8086; 2311-5521; 2424-8622
 10.1016/j.procir.2017.02.035,2.0,0951-192X; 0268-3768
 10.1016/j.procir.2017.12.168,9.0,2076-3417; 1742-6596; 0253-3839; 0178-2312; 0954-4054; 1996-1073
 10.1016/j.procir.2018.02.010,5.0,0951-192X; 2071-1050; 2076-3417; 1424-8220
 10.1016/j.procir.2018.03.103,19.0,0161-0457; 1098-1241; 0890-0604; 1367-5788; 1087-1357; 1729-8814; 2267-1242; 2095-8099; 1474-0346; 1532-0626; 0268-3768; 2227-7080; 0005-1055; 1331-677X
 10.1016/j.procir.2018.03.139,6.0,1742-6596; 0951-192X; 0268-3768; 2075-1702; 2391-5439
 10.1016/j.procir.2018.03.166,8.0,0178-2312; 1742-6596; 0268-3768; 0890-0604
 10.1016/j.procir.2018.03.178,8.0,0890-0604; 2504-4494; 1087-1357; 1532-0626; 1474-0346; 1350-6307; 0268-3768; 0020-7543
 10.1016/j.procir.2018.03.192,10.0,2076-3417; 0890-0604; 2504-4494; 2071-1050; 1424-8220; 1474-0346; 1747-7778; 0268-3768; 0020-7543
 10.1016/j.procir.2018.04.076,2.0,2071-1050; 2076-3417
 10.1016/j.procir.2018.04.078,6.0,2076-3417; 0094-114X; 1532-0626; 2075-5309; 0268-3768
 10.1016/j.procir.2019.02.087,3.0,0268-3768; 1742-6596; 1350-6307
 10.1016/j.procir.2019.02.104,3.0,2075-1702; 2095-8293; 2267-1242
 10.1016/j.procir.2019.02.131,2.0,2076-3417; 1751-7575
 10.1016/j.procir.2019.03.072,7.0,2076-3417; 2071-1050; 0951-192X; 2516-8398; 2571-631X
 10.1016/j.procir.2019.03.141,4.0,0951-192X; 2227-9717; 0020-7543; 1087-1357
 10.1016/j.procir.2019.03.182,1.0,1474-0346
 10.1016/j.procir.2019.03.212,1.0,0268-3768
 10.1016/j.procir.2019.03.223,3.0,2071-1050; 2076-3417; 0268-3768
 10.1016/j.procir.2019.04.040,6.0,1087-1357; 0268-3768; 0171-8096; 0020-7543; 1064-1246
 10.1016/j.procir.2019.04.049,2.0,2079-3197; 0020-7543
 10.1016/j.procir.2019.04.084,6.0,2504-4494; 2076-3417; 1424-8220; 0950-7051
 10.1016/j.procir.2019.04.103,3.0,1866-7511; 2076-3417; 1955-2513
 10.1016/j.procir.2019.04.176,2.0,0926-5805; 0020-7543
 10.1016/j.procir.2019.04.219,1.0,2071-1050
 10.1016/j.procir.2019.04.330,3.0,2071-1050; 2076-3417; 2288-6206
 10.1016/j.procir.2020.01.043,1.0,2267-1242
 10.1016/j.procir.2020.01.049,2.0,2076-3417
 10.1016/j.procir.2020.05.020,1.0,1999-5903
 10.1016/j.procs.2017.09.003,11.0,1098-1241; 2156-485X; 2213-7467; 1530-9827; 1087-1357; 2214-7853; 2523-3963; 2213-8463; 0937-7255; 1687-8086; 0041-624X
 10.1016/j.procs.2019.09.032,3.0,2314-4904; 1742-6596; 2424-8622
 10.1016/j.proeng.2014.12.394,6.0,2072-666X; 0264-1275; 1757-8981; 1024-123X; 0143-991X; 0010-4485
 10.1016/j.proeng.2014.12.395,1.0,0268-3768
 10.1016/j.proeng.2015.07.314,10.0,1742-6596; 1757-8981; 1024-123X; 0268-3768; 1755-1307
 10.1016/j.promfg.2017.04.039,8.0,0360-5442; 1742-6596; 2332-9017; 0954-4062; 0951-192X; 0268-3768; 0020-7543
 10.1016/j.promfg.2017.04.043,33.0,0886-7798; 1424-8220; 1868-5137; 0268-3768; 2076-3417; 2332-9017; 2516-8398; 1463-5771; 1996-1073; 2071-1050; 1087-1357; 2227-9717; 1741-038X; 0954-4054; 0020-7543; 0360-5442; 1742-6596; 0951-192X; 2213-8463; 2572-6668; 1751-7575
 10.1016/j.promfg.2017.07.094,20.0,0161-0457; 1742-6596; 1996-1944; 1087-1357; 2267-1242; 0951-192X; 2213-8463; 1350-6307; 0306-4549; 0268-3768; 2075-1702; 0020-7543; 1751-7575
 10.1016/j.promfg.2017.07.198,43.0,2363-7005; 2305-7084; 1350-6307; 0268-3768; 2075-1702; 2076-3417; 1999-5903; 0953-7287; 2516-8398; 1996-1073; 1064-1246; 1134-3060; 2624-6511; 2071-1050; 1087-1357; 2227-9717; 2095-8099; 0020-7543; 1729-8814; 2214-7853; 0951-192X; 1871-6784; 2213-8463; 1747-7778; 0005-1055; 1751-7575
 10.1016/j.promfg.2018.04.008,5.0,1742-6596; 0020-7543; 1754-2731; 1463-5771
 10.1016/j.promfg.2018.06.041,2.0,0268-3768; 2331-1916
 10.1016/j.promfg.2018.06.057,3.0,2234-7593; 0268-3768; 0020-7543
 10.1016/j.promfg.2018.07.146,3.0,0926-5805; 1742-6596; 0951-192X
 10.1016/j.promfg.2018.07.155,6.0,0268-3768; 2095-8099; 0954-4054; 0020-7543
 10.1016/j.promfg.2018.10.047,10.0,2076-3417; 1424-8220; 0951-192X; 1050-0472; 2234-7593; 2288-6206; 1064-1246
 10.1016/j.promfg.2018.10.070,1.0,2234-7593
 10.1016/j.promfg.2019.03.035,4.0,2314-4904; 1092-7026; 0954-4054; 1747-7778
 10.1016/j.promfg.2019.06.097,3.0,1050-0472; 2076-3417
 10.1016/j.promfg.2020.02.084,1.0,0268-3768
 10.1016/j.psep.2019.10.021,3.0,2071-1050; 1749-7728
 10.1016/j.rcim.2018.07.006,6.0,2076-3417; 1424-8220; 1080-3548; 0951-192X; 0268-3768
 10.1016/j.rcim.2019.101881,3.0,2071-1050; 0737-8831; 0268-3768
 10.1016/j.rcim.2019.101895,4.0,0161-0457; 0013-7944; 1367-5788; 0268-3768
 10.1016/j.rcim.2019.101917,2.0,1742-6596; 0020-7543
 10.1016/j.resconrec.2019.06.002,3.0,0360-5442; 2071-1050; 2227-9717
 10.1016/j.scriptamat.2016.12.005,31.0,1047-4838; 0022-0434; 2472-5854; 1424-8220; 0267-0836; 0268-3768; 2076-3417; 2073-4360; 2332-9017; 0307-904X; 2079-6412; 2214-8604; 2352-9407; 2071-1050; 1087-1357; 2095-8099; 1674-7321; 0020-7543; 1073-5623
 10.1016/j.ymssp.2019.106612,5.0,2076-3417; 1070-9622; 1367-5788; 1424-8220; 1087-1357
 10.1017/S089006041900012X,9.0,2076-3417; 2577-8196; 0890-0604; 2504-4494; 1742-6596; 1359-4311; 1474-0346; 1996-1073
 10.1038/s41467-020-19059-3,2.0,1616-301X; 2397-4621
 10.1049/iet-cim.2020.0009,1.0,1367-5788
 10.1049/iet-cim.2020.0041,9.0,2076-3417; 2424-8622
 10.1049/iet-epa.2018.5732,1.0,1367-5788
 10.1051/e3sconf/201912301049,1.0,2267-1242
 10.1051/e3sconf/201914004017,1.0,2267-1242
 10.1051/e3sconf/201914006008,2.0,2267-1242
 10.1051/matecconf/201930011004,2.0,2214-8604; 1047-4838
 10.1061/(ASCE)CO.1943-7862.0000825,1.0,1474-0346
 10.1061/(ASCE)ME.1943-5479.0000740,4.0,0926-5805; 2071-1050; 1474-0346; 1424-8220
 10.1061/(ASCE)ME.1943-5479.0000741,3.0,2071-1050; 2227-9717; 0360-3199
 10.1061/(ASCE)ME.1943-5479.0000745,3.0,1093-9687; 2364-8228; 1869-5450
 10.1061/(ASCE)ME.1943-5479.0000748,1.0,1424-8220
 10.1061/(ASCE)ME.1943-5479.0000774,1.0,2079-9292
 10.1061/(ASCE)ME.1943-5479.0000779,1.0,1562-3599
 10.1061/(ASCE)ME.1943-5479.0000797,1.0,0926-5805
 10.1063/1.5031520,7.0,0001-9240; 2071-1050; 1548-1115; 0010-485X; 0268-3768; 0020-7543; 2571-631X
 10.1063/1.5034337,1.0,1424-8220
 10.1063/1.5099723,2.0,0888-3270; 1475-9217
 10.1063/1.5128374,2.0,0307-904X; 2311-5521
 10.1080/00207543.2018.1443229,33.0,2076-3417; 2071-1050; 2332-9017; 1729-8814; 0951-192X; 1050-0472; 1868-5137; 2516-8398; 0268-3768; 0020-7543; 0307-904X; 2073-8994
 10.1080/00207543.2018.1471243,31.0,2076-3417; 0096-3003; 2071-1050; 0013-7944; 1424-8220; 1729-8814; 0951-192X; 2234-7593; 2190-7188; 0926-5805; 0268-3768; 0954-4054; 1562-2479; 0020-7543
 10.1080/00207543.2018.1497819,11.0,0360-5442; 2071-1050; 0951-192X; 2516-8398; 0020-7543
 10.1080/00207543.2018.1552032,22.0,2076-3417; 1070-9622; 2071-1050; 1367-5788; 1729-8814; 0360-3199; 1024-123X; 0268-3768; 0020-7543; 1996-1073; 0307-904X; 2073-8994
 10.1080/00207543.2019.1566661,34.0,0965-9978; 2076-3417; 2504-4494; 2071-1050; 1367-5788; 1424-8220; 2624-9375; 0951-192X; 1469-1930; 1474-0346; 0950-7051; 0268-3768; 1743-6753; 0954-4054; 0020-7543
 10.1080/00207543.2019.1581387,9.0,2076-3417; 1530-9827; 1424-8220; 1050-0472; 0268-3768; 0020-7543
 10.1080/00207543.2019.1607978,14.0,0360-5442; 2076-3417; 1424-8220; 1087-1357; 2095-8099; 0268-3768; 0020-7543
 10.1080/00207543.2019.1662133,4.0,0268-3768; 2095-8099; 0020-7543
 10.1080/00207543.2019.1683250,3.0,0020-7543; 2227-9717
 10.1080/00207543.2020.1714091,3.0,0020-7543; 0021-9983
 10.1080/01691864.2017.1297735,2.0,0926-5805; 1424-8220
 10.1080/01969722.2019.1705554,1.0,0360-5442
 10.1080/07373937.2014.962143,19.0,1350-4177; 0360-5442; 2076-3417; 0145-8892; 1866-7910; 0737-3937; 0947-7411; 2352-4847; 0145-8876
 10.1080/0951192X.2018.1529430,15.0,2076-3417; 1530-9827; 2332-9017; 1424-8220; 1080-3548; 0951-192X; 0268-3768
 10.1080/0951192X.2019.1599433,1.0,0020-7543
 10.1080/0951192X.2019.1599436,6.0,0951-192X; 0268-3768; 2076-3417; 2504-4494
 10.1080/0951192X.2019.1599439,9.0,2071-1050; 0263-2241; 0951-192X; 2288-6206; 2190-7188; 0268-3768; 0020-7543; 1064-1246
 10.1080/0951192X.2019.1686173,9.0,2076-3417; 2071-1050; 0263-2241; 1424-8220; 1087-1357; 0951-192X; 0020-7543
 10.1080/0951192X.2019.1699254,2.0,2071-1050; 2076-3417
 10.1080/0951192X.2020.1747642,1.0,2168-1163
 10.1080/10494820.2013.815221,55.0,1361-4568; 1359-4338; 1574-017X; 1360-2357; 2472-5854; 1463-922X; 1955-2513; 1868-5137; 0268-3768; 1049-4820; 2077-1312; 2095-2686; 0007-1013; 2076-3417; 1080-3548; 0018-7208; 2468-2322; 0141-9331; 0735-6331; 0144-929X; 1042-1629; 2073-431X; 1044-8004; 2071-1050; 0265-671X; 2414-4088; 0020-7543; 1044-7318; 2296-9144; 0952-1976; 1742-6596; 2261-236X
 10.1080/14786451.2012.724070,21.0,1744-2591; 0199-6231; 1556-3758; 1866-7910; 0737-3937; 2095-1701; 2071-1050; 1556-7036; 2523-3963; 0143-0750; 0947-7411; 1543-5075; 2352-4847; 1064-1246; 2050-0505
 10.1080/15732479.2019.1620789,4.0,1573-2479; 1687-8086; 2076-3417
 10.1080/16864360.2018.1462569,9.0,0360-5442; 1530-9827; 2193-9764; 2332-9017; 0951-192X; 1050-0472; 0954-4054
 10.1080/17517575.2018.1526324,14.0,2076-3417; 0888-3270; 0954-4054; 2071-1050; 1424-8220; 1729-8814; 2227-9717; 0951-192X; 2234-7593; 2391-5439; 0020-7543
 10.1080/21693277.2019.1660283,3.0,0951-192X; 0268-3768; 2076-3417
 10.1080/25726668.2019.1569367,5.0,0161-0457; 0360-3199; 0951-192X; 2572-6668; 2073-8994
 10.1080/25726668.2019.1645519,2.0,2572-6668; 2073-8994
 10.1088/0957-0233/24/9/095401,1.0,0040-5175
 10.1088/1674-4527/20/5/67,1.0,1674-4527
 10.1088/1742-6596/1168/2/022044,1.0,1573-062X
 10.1088/1742-6596/1391/1/012083,1.0,2571-631X
 10.1088/1742-6596/1618/2/022065,1.0,2071-1050
 10.1088/1757-899X/156/1/012002,1.0,1742-6596
 10.1088/1757-899X/324/1/012077,6.0,1134-3060; 2076-3417; 2640-4567; 1359-4311; 0020-7543
 10.1088/1757-899X/459/1/012075,1.0,0040-5175
 10.1088/1757-899X/711/1/012017,1.0,2079-6412
 10.1088/1757-899X/739/1/012048,1.0,2261-236X
 10.1108/CI-11-2019-0133,4.0,0926-5805; 1562-3599; 0969-9988; 2071-1050
 10.1108/ECAM-11-2019-0640,3.0,2071-1050; 1472-5967
 10.1108/IMDS-01-2018-0033,5.0,0360-5442; 0268-3768; 0951-192X; 1741-038X; 1463-7154
 10.1108/RPJ-06-2012-0058,15.0,2214-8604; 2411-5134; 1355-2546; 1738-494X; 2523-3963; 0268-3768; 1955-2513; 2631-8695
 10.1108/RPJ-12-2016-0210,4.0,2076-3417; 0268-3768; 0010-4485; 0730-6679
 10.1109/ACCESS.2017.2657006,57.0,2079-9292; 1077-5463; 1424-8220; 1532-0626; 1868-5137; 2288-6206; 0268-3768; 2076-3417; 2334-5837; 1386-145X; 2332-9017; 0141-9331; 1474-0346; 0926-5805; 1996-1073; 2050-7038; 0890-0604; 2071-1050; 2227-9717; 0010-485X; 1074-5351; 0954-4054; 0020-7543; 2078-2489; 2073-8994; 1742-6596; 0951-192X; 2213-8463; 2572-6668; 0929-6212; 1747-7778; 1617-9846; 1751-7575; 2424-8622
 10.1109/ACCESS.2017.2756069,56.0,2040-4166; 0954-4062; 1868-5137; 2288-6206; 0268-3768; 2075-1702; 2076-3417; 0953-7287; 1474-0346; 2516-8398; 2227-7080; 0965-9978; 2071-1050; 2156-485X; 1367-5788; 2227-9717; 2095-8099; 0954-4054; 0020-7543; 0360-5442; 1530-9827; 1742-6596; 1729-8814; 0951-192X; 2234-7593; 1747-7778
 10.1109/ACCESS.2017.2766453,35.0,2050-7038; 2076-3417; 2079-9292; 0890-0604; 1424-8220; 1729-8814; 2267-1242; 0951-192X; 0944-6524; 1050-0472; 1868-5137; 2095-8099; 0268-3768; 2075-1702; 0954-4054; 2391-5439; 0020-7543
 10.1109/ACCESS.2018.2793265,66.0,1573-062X; 1866-7910; 2041-1723; 2079-9292; 2079-3197; 1424-8220; 0029-5981; 1868-5137; 2288-6206; 0268-3768; 1741-0398; 2075-1702; 2333-5777; 2076-3417; 1746-5664; 1478-0771; 0953-7287; 1092-7026; 2332-9017; 1471-4175; 1474-0346; 0138-9130; 0926-5805; 2227-7080; 2352-9407; 2071-1050; 1367-5788; 1087-1357; 1469-1930; 2095-8099; 0265-671X; 0954-4054; 0020-7543; 0360-5442; 2058-8437; 1742-6596; 0951-192X; 1476-1122
 10.1109/ACCESS.2018.2890566,7.0,2076-3417; 0888-3270; 1367-5788; 0951-192X; 0925-2312; 2078-2489
 10.1109/ACCESS.2019.2891060,5.0,1367-5788; 0020-7543; 1729-8814
 10.1109/ACCESS.2019.2893309,5.0,0161-0457; 0951-192X; 1678-5878; 1063-293X; 0020-7543
 10.1109/ACCESS.2019.2897018,4.0,0268-3768; 0013-7944; 1424-8220; 1751-7575
 10.1109/ACCESS.2019.2909828,11.0,1134-3060; 2079-9292; 2398-6352; 1617-4909; 1424-8220; 2214-7853; 0010-485X; 1793-9623; 1751-7575
 10.1109/ACCESS.2019.2923610,8.0,2076-3417; 2640-4567; 1742-6596; 0263-2241; 1424-8220; 2524-521X; 2095-8099
 10.1109/ACCESS.2019.2928141,9.0,2076-3417; 1424-8220; 2072-4292; 1678-5878; 2073-8994
 10.1109/ACCESS.2019.2946515,4.0,2071-1050; 1367-5788; 0969-9988
 10.1109/ACCESS.2019.2950507,2.0,2076-3417; 1424-8220
 10.1109/ACCESS.2019.2950955,1.0,1367-5788
 10.1109/ACCESS.2019.2953499,7.0,2076-3417; 1098-1241; 2073-431X; 1424-8220; 2213-8463
 10.1109/ACCESS.2020.2970143,10.0,2076-3417; 2071-1050; 1742-6596; 2470-0045; 2227-9717
 10.1109/ACCESS.2020.2971576,1.0,2073-431X
 10.1109/ACCESS.2020.2974241,1.0,0268-3768
 10.1109/ACCESS.2020.2974810,3.0,2504-4494; 2071-1050; 1424-8220
 10.1109/ACCESS.2020.2981745,13.0,2076-3417; 2161-3915; 1225-6463; 2079-9292; 2071-1050; 1999-5903; 2327-0012; 1424-8220; 1687-1499; 2424-8622
 10.1109/ACCESS.2020.2998358,8.0,2076-3417; 2071-1050; 1742-6596; 1424-8220; 0096-3003; 1996-1073; 2571-631X
 10.1109/ACCESS.2020.2998723,3.0,1050-0472; 1474-0346; 2571-631X
 10.1109/ACCESS.2020.2999871,2.0,2040-2295; 1424-8220
 10.1109/ACCESS.2020.3000437,1.0,1424-8220
 10.1109/AIM.2018.8452707,1.0,0951-192X
 10.1109/BigData.2018.8622412,1.0,0020-7543
 10.1109/CIST.2018.8596460,1.0,0268-3768
 10.1109/COASE.2019.8842888,1.0,2267-1242
 10.1109/COASE.2019.8843166,2.0,2214-8604
 10.1109/COASE.2019.8843269,1.0,0361-7688
 10.1109/CyberC.2014.30,1.0,1942-4787
 10.1109/EDOCW.2018.00021,2.0,0951-192X; 0020-7543
 10.1109/EIConRus.2019.8656681,1.0,1229-7607
 10.1109/EMBC.2014.6943963,1.0,1955-2513
 10.1109/ETFA.2017.8247583,3.0,2288-5048; 1424-8220; 2411-5134
 10.1109/ETFA.2017.8247712,2.0,0951-192X; 1687-8086
 10.1109/ETFA.2018.8502467,3.0,0178-2312; 2424-8622
 10.1109/ETFA.2019.8868954,1.0,0268-3768
 10.1109/GHTC46095.2019.9033075,1.0,1999-5903
 10.1109/GLOBECOM38437.2019.9013428,1.0,2076-3417
 10.1109/HRI.2019.8673015,1.0,2296-9144
 10.1109/ICAC.2016.29,13.0,1433-2779; 0951-192X; 2364-415X; 1474-0346; 0010-485X; 2288-6206; 1741-038X; 0268-3768; 0020-7543
 10.1109/ICCCBDA.2018.8386518,1.0,0268-3768
 10.1109/ICCVE45908.2019.8965086,1.0,1424-8220
 10.1109/ICE.2019.8792577,1.0,1092-7026
 10.1109/ICE.2019.8792613,1.0,2523-3963
 10.1109/ICE.2019.8792622,1.0,2523-3963
 10.1109/ICE/ITMC49519.2020.9198403,1.0,2076-3417
 10.1109/ICIMTech.2019.8843814,1.0,2267-1242
 10.1109/ICITM48982.2020.9080395,1.0,1999-5903
 10.1109/ICNSC.2018.8361283,1.0,0268-3768
 10.1109/ICNSC.2018.8361285,1.0,1474-0346
 10.1109/ICNSC.2018.8361293,3.0,0268-3768; 1742-6596
 10.1109/ICSE-NIER.2019.00011,1.0,2524-8510
 10.1109/ICTC.2018.8539690,1.0,0951-192X
 10.1109/IECON.2018.8591464,2.0,2076-3417; 0009-286X
 10.1109/IECON.2018.8591653,1.0,1742-6596
 10.1109/IEEM.2017.8289898,5.0,0951-192X; 0268-3768; 2076-3417; 2424-8622
 10.1109/INDIN.2016.7819217,8.0,2071-1050; 0951-192X; 1868-5137; 0937-7255; 0268-3768; 0020-7543
 10.1109/INDIN.2018.8471979,3.0,0268-3768; 2210-464X; 2424-8622
 10.1109/INDIN.2018.8472014,2.0,0951-192X; 2504-4494
 10.1109/INDIN.2018.8472083,1.0,2504-4494
 10.1109/INDIN41052.2019.8972134,2.0,2504-4494; 1424-8220
 10.1109/INDIN41052.2019.8972267,1.0,2267-1242
 10.1109/IROS.2016.7759171,1.0,0253-3839
 10.1109/IS.2018.8710526,1.0,1742-6596
 10.1109/ISIE.2019.8781529,1.0,2523-3963
 10.1109/ITAIC.2019.8785703,1.0,2523-3963
 10.1109/JPROC.2017.2725482,6.0,2051-3305; 2079-9292; 0164-1212; 2213-8463; 1049-8923; 1687-1499
 10.1109/JPROC.2020.2998530,5.0,1871-6784; 1999-5903; 2076-3417; 2192-113X
 10.1109/JSYST.2019.2925627,1.0,1996-1073
 10.1109/KBEI.2015.7436192,1.0,2214-7853
 10.1109/M2VIP.2018.8600844,2.0,2076-3417; 1474-0346
 10.1109/MLSD.2018.8551867,1.0,2267-1242
 10.1109/MSP.2018.2842228,3.0,2476-1508; 0268-3768
 10.1109/MetroInd4.0IoT48571.2020.9138264,1.0,1996-1073
 10.1109/NAPS46351.2019.9000371,1.0,1999-5903
 10.1109/PICMET.2016.7806826,2.0,2161-3915
 10.1109/RUSAUTOCON.2019.8867800,1.0,1742-6596
 10.1109/SAS.2019.8706111,1.0,1999-5903
 10.1109/SEsCPS.2019.00012,2.0,0953-5438; 2076-3417
 10.1109/SIBIRCON48586.2019.8958367,1.0,0361-7688
 10.1109/SII.2017.8279217,1.0,1742-6596
 10.1109/SPEEDAM.2018.8445302,1.0,0937-7255
 10.1109/SSCI.2017.8285439,1.0,0003-682X
 10.1109/SYSCON.2017.7934796,3.0,0041-624X; 2334-5837
 10.1109/SYSOSE.2018.8428748,1.0,0951-192X
 10.1109/SYSOSE.2019.8753845,1.0,0268-3768
 10.1109/SYSOSE.2019.8753860,1.0,2523-3963
 10.1109/SysEng.2016.7753162,7.0,1742-6596; 2214-7853; 0951-192X; 2213-8463; 0268-3768; 0005-1055; 1996-1073
 10.1109/TII.2018.2804917,19.0,2076-3417; 2071-1050; 1092-7026; 1741-0401; 1424-8220; 0951-192X; 0141-9331; 0178-2312; 1868-5137; 1350-6307; 0926-5805; 0268-3768; 2075-1702; 0020-7543
 10.1109/TII.2018.2873186,46.0,0888-3270; 1573-062X; 1098-1241; 1424-8220; 1868-5137; 0268-3768; 0026-2714; 2076-3417; 1474-0346; 0926-5805; 1996-1073; 0307-904X; 0306-2619; 2071-1050; 1367-5788; 1087-1357; 0950-7051; 0160-5682; 0954-4054; 0020-7543; 2073-8994; 0360-5442; 0013-7944; 1742-6596; 2470-0045; 2523-3963; 2214-7853; 0951-192X
 10.1109/TII.2019.2938885,1.0,1996-1073
 10.1109/TII.2020.2977113,1.0,1424-8220
 10.1109/TLA.2020.9082917,1.0,1999-5903
 10.1109/TPEL.2019.2911594,8.0,1367-5788; 1742-6596; 0263-2241; 2194-5756; 0894-3370; 1996-1073; 2073-8994
 10.1109/TSMC.2019.2930418,4.0,2071-1050; 0020-7543; 1424-8220
 10.1109/TWC.2019.2927312,2.0,0268-3768; 1999-4893
 10.1109/UCC-Companion.2018.00039,1.0,0361-7688
 10.1109/WF-IoT.2018.8355217,1.0,2214-7853
 10.1109/WSC.2018.8632242,3.0,2076-3417; 2523-3963; 2213-8463
 10.1109/WSC40007.2019.9004659,2.0,2076-3417; 0020-7543
 10.1111/caim.12082,8.0,0963-1690; 0954-4062; 1363-9196; 1955-2513; 0934-9839
 10.1111/cgf.14023,1.0,0167-7055
 10.1111/exsy.12064,7.0,1042-6914; 0952-1976; 0254-5330; 0377-2217; 0361-2317; 1687-5265; 0950-7051
 10.1117/12.2042170,2.0,1674-1056; 1644-9665
 10.1134/S0005117915080111,1.0,0005-1179
 10.1145/3061639.3079847,1.0,2214-8604
 10.1155/2011/154798,65.0,0888-3270; 1047-4838; 0001-1452; 1868-5137; 1350-6307; 2364-415X; 0268-3768; 2075-1702; 1687-5966; 2076-3417; 2193-9764; 2332-9017; 2468-2322; 1475-9217; 0926-5805; 0307-904X; 0161-0457; 0268-4012; 2071-1050; 2156-485X; 1367-5788; 2227-9717; 0360-3199; 1050-0472; 0020-7543; 0022-460X; 1478-422X; 0013-7944; 1530-9827; 0951-192X; 2213-8463; 1687-8086; 2261-236X; 1751-7575
 10.1155/2013/263218,1.0,1687-725X
 10.1155/2014/439278,17.0,0376-9429; 2451-9049; 2071-1050; 0013-7944; 1367-5788; 2332-9017; 1424-8220; 2214-7853; 0094-9930; 1868-5137; 2364-415X; 0268-3768; 2571-631X
 10.1155/2014/648562,2.0,0264-1275; 2073-4360
 10.1155/2020/8888876,2.0,1424-8220
 10.1177/0040517516632471,5.0,1558-9250; 0040-5175
 10.1177/0954405412463857,14.0,2214-8604; 1355-2546; 2050-7526; 0264-1275; 1087-1357; 1779-6288; 1050-0472; 1745-2759; 0268-3768; 0954-4054
 10.1177/0954405413500663,1.0,1748-006X
 10.1177/0954406212473037,4.0,1674-7321; 0954-4828; 0954-4062
 10.1177/0954406219854466,3.0,1050-0472; 1424-8220; 1087-1357
 10.1177/1847979019828570,2.0,1847-9790
 10.1186/s40323-020-00147-4,1.0,0268-3768
 10.13196/j.cims.2017.01.001,1.0,1868-5137
 10.13196/j.cims.2017.04.010,1.0,2076-3417
 10.13196/j.cims.2017.08.001,1.0,0951-192X
 10.13196/j.cims.2019.01.001,2.0,1687-8086; 1742-6596
 10.1515/auto-2017-0133,4.0,0178-2312; 1093-9687; 2073-8994
 10.1515/auto-2019-0039,5.0,0178-2312; 2076-3417; 1367-5788; 1742-6596
 10.1515/eng-2020-0039,1.0,1424-8220
 10.1515/eng-2020-0040,1.0,2076-3417
 10.1515/itit-2017-0038,1.0,0951-192X
 10.1515/mspe-2019-0004,2.0,2076-3417
 10.1515/orga-2017-0017,20.0,2076-3417; 1746-5664; 2043-9377; 2071-1050; 0001-5970; 0953-7287; 1955-2513; 0951-192X; 1868-5137; 2288-6206; 1747-7778; 1366-5545; 0954-4054
 10.15439/2017F253,1.0,1424-8220
 10.21917/ijsc.2016.0173,2.0,1947-9344; 0256-2499
 10.2514/1.J055201,25.0,0888-3270; 1098-1241; 0001-1452; 1424-8220; 0748-8017; 0029-5981; 1868-5137; 1350-6307; 0268-3768; 2076-3417; 2332-9017; 0307-904X; 2571-631X; 1367-5788; 0954-4054; 2073-8994; 0013-7944; 1530-9827; 0951-192X; 0892-7219; 1024-123X; 1573-2479
 10.3103/S1068798X19060194,1.0,1068-798X
 10.3103/S1068798X19100101,1.0,1068-798X
 10.3103/S1068798X19120104,2.0,1742-6596; 1996-1073
 10.3233/FI-2020-1943,2.0,2078-2489; 1367-4803
 10.3390/app10020486,2.0,2076-3417; 2414-4088
 10.3390/app10072377,2.0,2076-3417; 1424-8220
 10.3390/app10082854,1.0,1424-8220
 10.3390/app10103342,3.0,2076-3417
 10.3390/app10103633,1.0,2076-3417
 10.3390/app10134678,2.0,2071-1050; 1424-8220
 10.3390/app10186519,2.0,2076-3417
 10.3390/app9183780,8.0,2076-3417; 2075-5309; 1471-4175
 10.3390/app9245567,5.0,1099-4300; 2076-3417
 10.3390/designs4020009,1.0,1742-6596
 10.3390/electronics9020319,6.0,2079-6374; 1424-8220; 1996-1073
 10.3390/en12101909,8.0,2079-6412; 2073-4352; 1996-1073
 10.3390/en12122389,7.0,0360-5442; 2073-4352; 1996-1073
 10.3390/en13184762,1.0,1996-1073
 10.3390/en13184979,1.0,1996-1944
 10.3390/en13205413,1.0,2075-1702
 10.3390/fi12090159,1.0,2076-3417
 10.3390/fi12100163,1.0,2076-3417
 10.3390/ijgi6070208,1.0,2267-1242
 10.3390/ijgi9040228,3.0,2071-1050; 2412-3811; 1996-1073
 10.3390/jmmp4030092,1.0,2504-4494
 10.3390/jmse8030200,3.0,2076-3417; 1996-1073; 2077-1312
 10.3390/machines7010002,3.0,0141-9331; 1076-2787; 2076-3417
 10.3390/mi11060614,3.0,0264-1275; 2073-4360; 0032-3888
 10.3390/pr7020094,12.0,1871-6784; 2227-9717; 2296-4185
 10.3390/pr7080537,1.0,0263-2241
 10.3390/pr8070866,1.0,2227-9717
 10.3390/s17112488,12.0,2076-3417; 1424-8220; 2504-446X; 2072-4292; 1385-2256; 0378-3774; 0143-1161; 2078-2489
 10.3390/s19173781,6.0,2227-9717; 1424-8220
 10.3390/s19204410,1.0,0268-3768
 10.3390/s20010097,4.0,2071-1050; 2076-3417; 1424-8220
 10.3390/s20041187,1.0,1687-8086
 10.3390/s20123515,2.0,1424-8220
 10.3390/s20133709,2.0,2071-1050; 1424-8220
 10.3390/s20164637,1.0,2073-8994
 10.3390/s20175003,1.0,1424-8220
 10.3390/su11010159,10.0,2071-1050; 1940-1493; 1996-1073
 10.3390/su11185036,1.0,1024-123X
 10.3390/su12030936,3.0,2071-1050; 1424-8220
 10.3390/su12031088,7.0,2076-3417; 2071-1050; 1742-6596; 1424-8220; 1474-0346; 0020-7543
 10.3390/su12062286,6.0,2071-1050; 2076-3417; 1996-1073
 10.3390/su12062307,5.0,2624-6511; 2071-1050; 2220-9964; 1424-8220
 10.3390/su12072940,1.0,2071-1050
 10.3991/ijoe.v13i08.7270,1.0,2218-6581
 10.4028/www.scientific.net/AMM.575.493,9.0,0178-7675; 0263-2241; 1424-8220; 0264-4401; 0268-3768; 0941-0643; 0217-9792; 2053-1591
 10.4028/www.scientific.net/AMR.472-475.206,1.0,0020-7403
 10.4028/www.scientific.net/MSF.957.340,1.0,1022-1360
--- a/doi_parse/合并统计.py
+++ b/doi_parse/合并统计.py
@ -0,0 +1,40 @@
 # -*- coding: utf-8 -*-
 # @Time    : 2022/6/6 10:26
 # @Author  : ZhaoXiangPeng
 # @File    : 合并统计.py
 import pandas as pd
 from 数据处理.文件合并 import join, load_file
 # Directory holding the per-file tables that are merged further down.
 # The trailing slash matters: file paths are built later as `path + f`.
 path = 'E:/inspec合并/'
 # load_file is a project helper; presumably it returns the bare file names
 # found under `path` (they are appended to `path` below) — TODO confirm.
 files = load_file(path)
 # files = [f'{path}{f}' for f in files]
 # Print the discovered files so the run can be checked by eye.
 print(files)
 def count(file_name: str):
    """Read one CSV file and hand its rows back untouched.

    :param file_name: path of the CSV file to load.
    :return: the DataFrame produced by ``pd.read_csv``; no grouping or
        summing is applied in this function.
    """
    return pd.read_csv(file_name)
 # Reference DOI list: keep only the DOI column ('DI') and de-duplicate it.
 # Chaining returns a new frame instead of calling an in-place method on a
 # column slice, which pandas flags with SettingWithCopyWarning.
 df0 = pd.read_csv(r'F:\工作数据存储2022\20220526_inspec测试\Digital twin searching result 2.csv')
 df0 = df0[['DI']].drop_duplicates()
 # Gather the matching rows of every per-file table and concatenate once;
 # calling pd.concat inside the loop re-copies all earlier rows on each pass.
 matched = []
 for f in files:
    t_df = count(path+f)
    # Reset the index so that 'doi' is an ordinary column to join on.
    t_df = t_df.reset_index()
    ts = pd.merge(df0, t_df, how='left', left_on=['DI'], right_on=['doi'])
    # Left merge + filter: keep only reference DOIs that have a count in this file.
    matched.append(ts[ts['count'].notnull()])
 if matched:
    big_df = pd.concat(matched, ignore_index=True)
 else:
    # No input files: keep the columns the aggregation below relies on so the
    # script still produces a (header-only) result instead of a KeyError.
    big_df = pd.DataFrame(columns=['doi', 'count', 'issn'])
 # Aggregate per DOI: total count plus the distinct ISSNs.
 # Grouping by the scalar key 'doi' (not the one-element list ['doi']) keeps the
 # group key a plain value; pandas >= 2.0 yields 1-tuples for list keys, which
 # would end up in the 'doi' column as "('10.…',)".
 pp = [
    {
        'doi': doi,
        'count': g['count'].sum(),
        # dropna: a missing ISSN is a float NaN and would make str.join raise.
        # sorted: iterating a set of str gives a different order on every run.
        'issn': '; '.join(sorted(g['issn'].dropna().astype(str).unique())),
    }
    for doi, g in big_df.groupby('doi')
 ]
 # Explicit columns keep the CSV header stable even when nothing matched.
 df = pd.DataFrame(data=pp, columns=['doi', 'count', 'issn'])
 df.to_csv('inspec数据库施引测试数据.csv', index=False)