diff --git a/article_subject/__init__.py b/article_subject/__init__.py new file mode 100644 index 0000000..e4c8aa5 --- /dev/null +++ b/article_subject/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/6/21 8:50 +# @Author : ZhaoXiangPeng +# @File : __init__.py diff --git a/article_subject/extract_score.py b/article_subject/extract_score.py new file mode 100644 index 0000000..ca5fbd9 --- /dev/null +++ b/article_subject/extract_score.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/5/4 16:33 +# @Author : zhaoxiangpeng +# @File : extract_score.py +# 2023年6月6日09:52:59修改首先要对 进行聚合,分为多个表,计算分数 + +import os +import pandas as pd +from typing import Union +import data_process_tool +from article_subject.utils import get_row_top_join_sub, get_row_top + +get_subject = get_row_top_join_sub + + +class GroupScore: + def __init__(self, by_column, base_columns: list = None): + self.columns = base_columns + + def get_score(self, table_or_file: Union[pd.DataFrame, str]) -> pd.DataFrame: + if isinstance(table_or_file, str): + table_or_file = data_process_tool.read_data(table_or_file) + assert isinstance(table_or_file, pd.DataFrame) + groups = table_or_file.groupby(by=['']) + + +class BaseExtractScore: + def __init__(self, table: Union[pd.DataFrame, str], base_columns: list = None, score_columns: list = None): + self._table = table + self._columns = base_columns + + def process(self) -> pd.DataFrame: + if isinstance(self._table, str): + self._table = data_process_tool.read_data(self._table) + + # 把基本信息列当做行索引,计算分数值 + del_columns = set(self._table.columns) - set(self._columns) + self._table.set_index(keys=self._columns, inplace=True) + self._table['高分学科'] = self._table.apply(get_subject, axis=1) + self._table.reset_index(inplace=True) + + self._table.drop(labels=list(del_columns), axis=1, inplace=True) + return self._table + + +def task1(): + PATH = 'F:/工作数据存储2023/20230426_评分模型/' + file = os.path.join(PATH, '知识视界分类数据(已人工判断)_评分模型.csv') + ins = BaseExtractScore( + table=file, + base_columns=['Title', 'ABS', 'title_en', 'Title_ABS', 'cncole', '教育部学科门类', '教育部一级学科', '教育部二级学科', '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确', 'Unnamed: 13'] + ) + ta = ins.process() + ta.to_csv(os.path.join(PATH, '知识视界分类数据(已人工判断)_评分模型-高分学科.csv'), index=False) + + +def task2(): + PATH = 'F:/工作数据存储2023/20230426_评分模型/' + file = os.path.join(PATH, 'jove-中图分类号数据_评分模型.csv') + ins = BaseExtractScore( + table=file, + base_columns=['Genre - 655 indicatior 1 and 2 " 4"', 'Subjects - 650 indicator 1 and 2 " 4"', 'Additional Material Characteristics 006 “m o c “', 'Physical Description Fixed Field - 007 “cr unu”', 'Video ID 001', 'Video Name 245 $a indicator 1 and 2 "00"', '856 $3 inidcatior 1 and 2 "40"', 'Link-856 $u', 'Physical Decription-300 $a', 'Format-', 'Runnng Time-', 'Subtitle/caption language codes (as ISO 3-letter codes separated by semicolons)', 'Chapter Number-', 'Formatted Contents: Chapter-505 $a', 'Content Type 336 “$btdi $2rdacontent”', 'Material Type 337 "$bc $2rdamedia”', 'Carrier Type - 338 “$bcr$2rdacarrier"', 'Summary - 520 $a', 'Source of Description Note-588 $a', 'Date-260 $c', 'Publisher-260 $b', 'City-260 $a', 'Country Code-008', 'Language-008', 'Series-490', 'ISSN-022 $a', 'cnCode', '教育部学科门类', '教育部一级学科', '教育部二级学科', '未能识别的clc', '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确'] + ) + ta = ins.process() + ta.to_csv(os.path.join(PATH, 'jove-中图分类号数据_评分模型-高分学科.csv'), index=False) + + +def task3(): + PATH = 'Z:/数据处理流程/' + file = os.path.join(PATH, '0028-4793_评分模型.csv') + ins = BaseExtractScore( + table=file, + 
base_columns=['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'OI', 'FU', 'FP', 'FX', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA', 'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA', 'UT', 'cncode', '一级学科', '二级学科', '一级学科数目', '二级学科数目'] + ) + ta = ins.process() + ta.to_csv(os.path.join(PATH, '0028-4793-高分学科.csv'), index=False) + + +if __name__ == '__main__': + # task1() + # task2() + task3() diff --git a/article_subject/main.py b/article_subject/main.py new file mode 100644 index 0000000..f8e7a05 --- /dev/null +++ b/article_subject/main.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/6/21 8:56 +# @Author : ZhaoXiangPeng +# @File : main.py + +import pandas as pd +from article_subject.utils import get_row_top, merge_table, get_today + + +def func(row: pd.Series, num: int = 3): + x, y, z = row['教育部一级学科'],row['HighScoreSubject'],row['HighScoreSubjectJournal'] + subjects = ';'.join([x, y, z]).split(';') + # subjects = '' + # subjects += x and x + # subjects += ';' + y and y + # subjects += ';' + z and z + sub_count = {} + for sub in subjects: + if sub in sub_count: + sub_count[sub] += 1 + else: + sub_count[sub] = 1 + sub_list = [] + sub_sort = sorted(sub_count.items(), key=lambda x: -x[1]) + for sub in sub_sort: + # 出现次数2次以上 + if len(sub_list) < 3 and sub[1] > 2: + sub_list.append(sub[0]) + return ';'.join(sub_list) + + +class Article2Subject: + def __init__(self): + self.periodical = None + + def periodical_top(self, filename): + periodical = pd.read_csv(filename) + periodical.set_index(['issn'], inplace=True) + periodical['HighScoreSubjectJournal'] = periodical.apply(get_row_top, axis=1) + periodical = periodical[['HighScoreSubjectJournal']] + periodical.reset_index(inplace=True) + self.periodical = periodical + return periodical + + def tc_huang_subject(self, filename): + tc_huang = pd.read_csv(filename) + tc_huang = tc_huang[tc_huang['一级学科'].notnull()] + tc_huang_list = [] + group = tc_huang.groupby(by=['issn']) + for _, g in group: + tc_huang_list.append({'issn': _, 'HighScoreSubjectJournal': ';'.join(g['一级学科'])}) + tc_huang = pd.DataFrame(tc_huang_list) + self.tc_huang = tc_huang + return tc_huang + + def join_journal_subject(self): + journal_subject = pd.concat([self.periodical, self.tc_huang]) + journal_subject.drop_duplicates(subset=['issn'], keep='last', inplace=True) + self.journal_subject = journal_subject + + def score_model(self, filename): + base_columns = ['论文标题', '文献类型', '发表年份', '卷', '期', '发表月份', 'WOS核心合集被引频次', 'Scopus被引频次', '语言', '开始页数', '结束页数', 'WOSID', 'EID', '出版物名称', 'ISSN', '出版社', 'SCIE收录', 'SSCI收录', 'ESCI收录', 'A&HCI收录', 'EI收录', 'SCOPUS收录', 'snip值', 'sjr值', 'JCR收录', 'Q区间', 'IF值', 'JCR学科', 'ESI收录', 'ESI学科', '教育部门类', '教育部一级学科', '教育部二级学科', 'CSSCI收录', 'CSCD收录', 'AF', '作者列表', '是否第一作者', '第一作者机构', '第一作者', 'RP', '通讯作者地址', '是否通讯作者', '通讯作者机构', '通讯作者', 'C1', '作者地址', '本校机构信息', '本校学者排序', '本校学者信息', '摘要', 'sc'] + # del_columns = ['论文标题', '文献类型', '发表年份', '卷', '期', '发表月份', 'WOS核心合集被引频次', 'Scopus被引频次', '语言', '开始页数', '结束页数', 'WOSID', 'EID', '出版物名称', '出版社', 'SCIE收录', 'SSCI收录', 'ESCI收录', 'A&HCI收录', 'EI收录', 'SCOPUS收录', 'snip值', 'sjr值', 'JCR收录', 'Q区间', 'IF值', 'JCR学科', 'ESI收录', 'ESI学科', '教育部门类', '教育部一级学科', '教育部二级学科', 'CSSCI收录', 'CSCD收录', 'AF', '作者列表', '是否第一作者', '第一作者机构', '第一作者', 'RP', '通讯作者地址', '是否通讯作者', '通讯作者机构', '通讯作者', 'C1', '作者地址', 
'本校机构信息', '本校学者排序', '本校学者信息', '摘要', 'sc'] + score = pd.read_csv(filename) + score_columns = ['军事学_军事后勤学与军事装备学', '军事学_军事思想及军事历史', '军事学_军事管理学', '军事学_军事训练学', '军事学_军队指挥学', '军事学_军队政治工作学', '军事学_战役学', '军事学_战术学', '军事学_战略学', '农学_作物学', '农学_兽医学', '农学_农业资源与环境', '农学_园艺学', '农学_林学', '农学_植物保护', '农学_水产', '农学_畜牧学', '农学_草学', '医学_中医学', '医学_中药学', '医学_中西医结合', '医学_临床医学', '医学_公共卫生与预防医学', '医学_医学技术', '医学_口腔医学', '医学_基础医学', '医学_护理学', '医学_特种医学', '医学_药学', '历史学_世界史', '历史学_中国史', '历史学_考古学', '哲学_哲学', '工学_交通运输工程', '工学_仪器科学与技术', '工学_信息与通信工程', '工学_光学工程', '工学_公安技术', '工学_兵器科学与技术', '工学_农业工程', '工学_冶金工程', '工学_力学', '工学_动力工程及工程热物理', '工学_化学工程与技术', '工学_土木工程', '工学_地质资源与地质工程', '工学_城乡规划学', '工学_安全科学与工程', '工学_建筑学', '工学_控制科学与工程', '工学_机械工程', '工学_材料科学与工程', '工学_林业工程', '工学_核科学与技术', '工学_水利工程', '工学_测绘科学与技术', '工学_环境科学与工程', '工学_生物医学工程', '工学_生物工程', '工学_电子科学与技术', '工学_电气工程', '工学_石油与天然气工程', '工学_矿业工程', '工学_纺织科学与工程', '工学_网络空间安全', '工学_航空宇航科学与技术', '工学_船舶与海洋工程', '工学_计算机科学与技术', '工学_软件工程', '工学_轻工技术与工程', '工学_风景园林学', '工学_食品科学与工程', '教育学_体育学', '教育学_心理学', '教育学_教育学', '文学_中国语言文学', '文学_外国语言文学', '文学_新闻传播学', '法学_公安学', '法学_政治学', '法学_民族学', '法学_法学', '法学_社会学', '法学_马克思主义理论', '理学_化学', '理学_地球物理学', '理学_地理学', '理学_地质学', '理学_大气科学', '理学_天文学', '理学_数学', '理学_海洋科学', '理学_物理学', '理学_生态学', '理学_生物学', '理学_科学技术史', '理学_系统科学', '理学_统计学', '管理学_公共管理', '管理学_农林经济管理', '管理学_图书情报与档案管理', '管理学_工商管理', '管理学_管理科学与工程', '经济学_应用经济学', '经济学_理论经济学', '艺术学_戏剧与影视学', '艺术学_美术学', '艺术学_艺术学理论', '艺术学_设计学', '艺术学_音乐与舞蹈学'] + # 计算分数列的最高分学科 + score['HighScoreSubject'] = score[score_columns].apply(get_row_top, axis=1) + del_columns = ['军事学_军事后勤学与军事装备学', '军事学_军事思想及军事历史', '军事学_军事管理学', '军事学_军事训练学', '军事学_军队指挥学', '军事学_军队政治工作学', '军事学_战役学', '军事学_战术学', '军事学_战略学', '农学_作物学', '农学_兽医学', '农学_农业资源与环境', '农学_园艺学', '农学_林学', '农学_植物保护', '农学_水产', '农学_畜牧学', '农学_草学', '医学_中医学', '医学_中药学', '医学_中西医结合', '医学_临床医学', '医学_公共卫生与预防医学', '医学_医学技术', '医学_口腔医学', '医学_基础医学', '医学_护理学', '医学_特种医学', '医学_药学', '历史学_世界史', '历史学_中国史', '历史学_考古学', '哲学_哲学', '工学_交通运输工程', '工学_仪器科学与技术', '工学_信息与通信工程', '工学_光学工程', '工学_公安技术', '工学_兵器科学与技术', '工学_农业工程', '工学_冶金工程', '工学_力学', '工学_动力工程及工程热物理', '工学_化学工程与技术', '工学_土木工程', '工学_地质资源与地质工程', '工学_城乡规划学', '工学_安全科学与工程', '工学_建筑学', '工学_控制科学与工程', '工学_机械工程', '工学_材料科学与工程', '工学_林业工程', '工学_核科学与技术', '工学_水利工程', '工学_测绘科学与技术', '工学_环境科学与工程', '工学_生物医学工程', '工学_生物工程', '工学_电子科学与技术', '工学_电气工程', '工学_石油与天然气工程', '工学_矿业工程', '工学_纺织科学与工程', '工学_网络空间安全', '工学_航空宇航科学与技术', '工学_船舶与海洋工程', '工学_计算机科学与技术', '工学_软件工程', '工学_轻工技术与工程', '工学_风景园林学', '工学_食品科学与工程', '教育学_体育学', '教育学_心理学', '教育学_教育学', '文学_中国语言文学', '文学_外国语言文学', '文学_新闻传播学', '法学_公安学', '法学_政治学', '法学_民族学', '法学_法学', '法学_社会学', '法学_马克思主义理论', '理学_化学', '理学_地球物理学', '理学_地理学', '理学_地质学', '理学_大气科学', '理学_天文学', '理学_数学', '理学_海洋科学', '理学_物理学', '理学_生态学', '理学_生物学', '理学_科学技术史', '理学_系统科学', '理学_统计学', '管理学_公共管理', '管理学_农林经济管理', '管理学_图书情报与档案管理', '管理学_工商管理', '管理学_管理科学与工程', '经济学_应用经济学', '经济学_理论经济学', '艺术学_戏剧与影视学', '艺术学_美术学', '艺术学_艺术学理论', '艺术学_设计学', '艺术学_音乐与舞蹈学'] + # score.drop(del_columns, inplace=True) + for c in del_columns: + del score[c] + # 处理基础数据 + hebing_df = self.base_add_journal(score) + # hebing_df['LikeSubject'] = hebing_df['教育部一级学科']+hebing_df['HighScoreSubject']+hebing_df['HighScoreSubjectJournal'] + hebing_df['LikeSubject'] = hebing_df.apply(func, axis=1) + # hebing_df.drop(['issn'], inplace=True) + hebing_df.to_csv(f'./文章到学科{get_today()}.csv', index=False) + + def base_add_journal(self, base_table): + base_table = base_table[base_table['教育部一级学科'].notnull()] + new_df = pd.merge(left=base_table, right=self.journal_subject, how='left', left_on=['ISSN'], right_on=['issn']) + return new_df[new_df['HighScoreSubjectJournal'].notnull()] + # return new_df 
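    # --- Hedged sketch (editor's addition, not part of the original code) ---
    # The comment on `func` above says "出现次数2次以上" (appears at least twice),
    # but the condition `sub[1] > 2` only keeps subjects seen 3+ times. A minimal
    # standalone version of the intended voting step, assuming "at least twice"
    # is what was meant, might look like the helper below; it is an illustration,
    # not the project's API.
    @staticmethod
    def vote_subjects_sketch(row: pd.Series, top: int = 3, min_count: int = 2) -> str:
        # Count how often each subject appears across the three subject columns.
        counts = {}
        for col in ('教育部一级学科', 'HighScoreSubject', 'HighScoreSubjectJournal'):
            val = row.get(col)
            if pd.isna(val):
                continue
            for sub in str(val).split(';'):
                sub = sub.strip()
                if sub:
                    counts[sub] = counts.get(sub, 0) + 1
        # Rank by frequency, keep subjects seen in at least `min_count` sources,
        # and return at most `top` of them joined with ';'.
        ranked = sorted(counts.items(), key=lambda kv: -kv[1])
        picked = [s for s, c in ranked if c >= min_count][:top]
        return ';'.join(picked)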
+ + def execute(self): + self.periodical_top('../SubjectData/getScore:step1.csv') + self.tc_huang_subject('D:/Work/教育部学科数据2021年6月2日_processed.csv') + self.join_journal_subject() + self.score_model(f'./merge_table_{get_today()}.csv') + + +if __name__ == '__main__': + # merge_table('Z:/文章摘要推荐池/学科评分/toppaper核心数据/', './') + a2s = Article2Subject() + a2s.execute() diff --git a/article_subject/utils.py b/article_subject/utils.py new file mode 100644 index 0000000..072cb23 --- /dev/null +++ b/article_subject/utils.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/6/21 8:53 +# @Author : ZhaoXiangPeng +# @File : utils.py + +import pandas as pd +import datetime +import os + + +def get_today(fmt='%Y%m%d'): + return datetime.date.today().strftime(fmt) + + +def get_row_top(row: pd.Series, num: int = 3): + # print(row) + top_sub_list = row.sort_values(ascending=False)[:num].index + top_sub = [] + for sub in top_sub_list: + top_sub.append(sub.split('_')[-1]) + return ';'.join(top_sub) + + +def get_row_top_join_sub(row: pd.Series, num: int = 3, split: float = 0.9, split_on: bool = True): + if split_on: + new_row = row[row.values >= split] + row_len = len(new_row) + if row_len == 0: + num = 1 + elif row_len < num: + num = row_len + else: + num = num + top_sub_list = row.sort_values(ascending=False)[:num].to_dict() + top_sub = [] + for sub, score in top_sub_list.items(): + top_sub.append(f'{sub},{score}') + return '; '.join(top_sub) + + +def merge_table(filepath, output_path: str = None): + """ + filepath: 输入文件路径 + output_path: 如果不为空,则文件保存到此目录 + """ + if filepath[-1] != '/': + filepath += '/' + file_list = os.listdir(filepath) + return_df = pd.DataFrame() + for filename in file_list: + temp_df = pd.read_csv(filepath+filename) + return_df = pd.concat([return_df, temp_df]) + if output_path: + day = get_today() + return_df.to_csv(output_path+f'/merge_table_{day}.csv', index=False) + return return_df diff --git a/bcr/BCR_20240201.py b/bcr/BCR_20240201.py new file mode 100644 index 0000000..510d2dc --- /dev/null +++ b/bcr/BCR_20240201.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/2/2 10:45 +# @Author : zhaoxiangpeng +# @File : BCR_20240201.py + +import os +from copy import deepcopy +import pandas as pd +import data_process_tool + +from bcr.utils import read_file, str2float, str2int +from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH + +# ROOT_PATH = "Y:\\zhaoxiangpeng\\2024BCR" +# ROOT_PATH = "Y:\\zhaoxiangpeng\\BCR202403" +# ROOT_PATH = "Y:\\BCR\\202407" + +C_COLUMNS = ['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total', 'ISBN'] + + +def main(): + table = read_file(os.path.join(ROOT_PATH, 'MergeFile')) + + t2 = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t') + table = pd.concat([table, t2]) + table.drop_duplicates(subset=['EID'], keep='last', inplace=True) + + # 把数量统计标准化 + table['2021'] = table['2021'].apply(str2float) + table['2022'] = table['2022'].apply(str2float) + table['2023'] = table['2023'].apply(str2float) + table['Grand Total'] = table['Grand Total'].apply(str2float) + step2_table = step2(table, export=True) + step3_table, no_data_table = step3(step2_table, export=True) + step4(no_data_table) + + +def process1(table: pd.DataFrame): + TABLE2 = deepcopy(table) + # 表头重命名 + # new_columns = data_process_tool.rename_head(TABLE2, postfix='-Other') + # TABLE2.rename(columns=new_columns, inplace=True) + + # 根据doi去重只保留一个用于doi匹配 + DOI_PROED = TABLE2.dropna(subset=['DOI']) + DOI_PROED.drop_duplicates(subset=['DOI'], inplace=True) + # 
把doi为空的删掉,没有doi的用isbn匹配 + ISBN_PROED = TABLE2[TABLE2['DOI'].isnull()] + ISBN_PROED.drop_duplicates(subset=['ISBN'], inplace=True) + + return DOI_PROED, ISBN_PROED + + +def process_func2(table: pd.DataFrame): + """ + isbn分列 + """ + TABLE2 = deepcopy(table) + TABLE2['ISBN'] = TABLE2['ISBN'].astype(str) # 要转为str类型,不然会分不到 + ISBNs = TABLE2['ISBN'].str.split('; ', expand=True) + ISBNs = ISBNs.stack() # 把行转成列 + ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引 + ISBNs.name = 'ISBN' + EID_PROED: pd.DataFrame = TABLE2.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs) + return EID_PROED + + +def process_func3(export: bool = True): + """ + 合并两个scopus表 + """ + keep_columns = [ + 'Title', 'Scopus ID', + 'Print ISBN', 'E-ISBN', 'Other ISBN', + 'Publication year', 'Publisher imprint', 'Publisher imprints grouped to main Publisher', + 'Classification 1', 'Classification 2', 'Classification 3', 'Classification 4', + ] + export_file_path = os.path.join(ROOT_PATH, "After\\4.两表字段合并.xlsx") + if not os.path.exists(export_file_path): + table1_path = os.path.join('Y:\\BCR\\202407', 'Scopusbooks04072023.xlsx') + table2_path = os.path.join('Y:\\BCR\\202407', 'Scopus Books June 2023新增书目3.9种及检索式.xlsx') + table1 = pd.read_excel(table1_path, sheet_name=0) + table1 = table1[keep_columns] + table2 = pd.read_excel(table2_path, sheet_name=0) + table2 = table2[keep_columns] + table0 = pd.concat([table1, table2]) + table0.drop_duplicates(subset=['Print ISBN', 'E-ISBN', 'Other ISBN'], keep='last', inplace=True) + table0['Scopus ID'] = table0['Scopus ID'].astype(str) + if export: + table0.to_excel(export_file_path, index=False) + else: + table0 = pd.read_excel(export_file_path, sheet_name=0) + return table0 + + +def step2(table: pd.DataFrame, export: bool = True): + """ + ppt第二个需求 + """ + group_by = table.groupby(by=['ISBN'])['2021', '2022', '2023', 'Grand Total'].sum() + group_by.reset_index(inplace=True) + + keep_columns = table[['DOI', 'ISBN']] + keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True) + + result_table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN']) + if export: + result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False) + + return result_table + + +def step3(table: pd.DataFrame, export: bool = True): + """ + ISBN合并记录与上一年BCR总表匹配 + 按DOI、ISBN的顺序匹配,匹配到的记录,二个表的字段合并 + """ + BASE_FILE = pd.read_excel("Y:\\BCR\\2024BCR\\BCR2023数据处理\\副本BCR2022总表-20220729.xlsx", + sheet_name=0) # 上一年的BCR总表 + # 表头加标记 + new_columns = data_process_tool.rename_head(BASE_FILE, postfix='-Other') + BASE_FILE.rename(columns=new_columns, inplace=True) + + doi_table, isbn_table = process1(table) + doi_ = pd.merge(doi_table, BASE_FILE, how='left', left_on=['DOI'], right_on=['DOI-Other']) + """ + # 把doi分成有数据的和没数据的 + has_data = doi_[doi_['DOI-Other'].notnull()] # 匹配到数据 + no_data = doi_[doi_['DOI-Other'].isnull()] # 使用doi没有匹配到的 + # del doi_ + no_data = no_data[table.columns.values.tolist()] # 把没有数据的多余列去除 + """ + # 用没有匹配到doi的数据用isbn进行匹配 + isbn_ = pd.merge(isbn_table, BASE_FILE, how='left', left_on=['ISBN'], right_on=['ISBN-Other']) # 这些就不用考虑没有匹配到的了,因为没有剩下的条件了 + # 合并doi匹配结果和isbn的结果 + result_table = pd.concat([doi_, isbn_]) + if export: + result_table.to_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR匹配结果.xlsx'), index=False) + + # 通过doi和isbn都没有匹配到的 + all_no_data = result_table[result_table['ISBN-Other'].isnull()] + all_no_data = all_no_data[table.columns.values.tolist()] # 保留基础列 + if export: + all_no_data.to_excel(os.path.join(ROOT_PATH, 
'RESULT\\3.BCR未匹配到.xlsx'), index=False) + + return result_table, all_no_data + + +def step4(table: pd.DataFrame, export: bool = True): + """ + ISBN合并记录与上一年BCR总表不匹配记录处理 + 与SCOPUS来源书目匹配 + 把二个表的ISBN分列,进行交叉匹配 + 把二个表的字段进行合并 + 再与OASIS记录匹配 + 获取作者、学科分类数据 + 删除敏感书目 + """ + df1 = process_func2(table) # 不匹配记录 + df1.drop_duplicates(subset=['ISBN RAW', 'ISBN'], inplace=True) + df1['ISBN'] = df1['ISBN'].astype(str) + print(df1) + + df2 = process_func3(export=export) + for col in ['Print ISBN', 'E-ISBN', 'Other ISBN']: + df2[col] = df2[col].astype(str) + + c1 = pd.merge(df1, df2, left_on=['ISBN'], right_on=['Print ISBN'], how='left') + c1_in = c1[c1['Print ISBN'].notnull()] + c1_not = c1[c1['Print ISBN'].isnull()] + c1_not = c1_not[C_COLUMNS] + + c2 = pd.merge(c1_not, df2, left_on=['ISBN'], right_on=['E-ISBN'], how='left') + c2_in = c2[c2['E-ISBN'].notnull()] + c2_not = c2[c2['E-ISBN'].isnull()] + c2_not = c2_not[C_COLUMNS] + + c3 = pd.merge(c2_not, df2, left_on=['ISBN'], right_on=['Other ISBN'], how='left') + + c3_in = c3[c3['Other ISBN'].notnull()] + c3_not = c3[c3['Other ISBN'].isnull()] + + # 3次匹配结果合并 + r1_in = pd.concat([c1_in, c2_in, c3_in]) + r1_in.drop_duplicates(subset=['ISBN RAW'], inplace=True) + + r1_not = c3_not + r1_not = pd.concat([r1_not, r1_in, r1_in]).drop_duplicates(subset=['ISBN RAW'], keep=False) + r1_not = r1_not[['DOI', 'ISBN RAW', '2022', '2023', '2024', 'Grand Total']] + r1_not.rename(columns={'ISBN RAW': 'ISBN'}, inplace=True) + if export: + r1_in.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目匹配.xlsx'), index=False) + r1_not.to_excel(os.path.join(ROOT_PATH, 'RESULT\\4.与SCOPUS来源书目未匹配到.xlsx'), index=False) + + +if __name__ == '__main__': + main() + """ + step2_table = pd.read_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), sheet_name=0) + step3_table, no_data_table = step3(step2_table, export=True) + step4(no_data_table) + """ + # ste3_table = pd.read_excel(os.path.join(ROOT_PATH, 'RESULT\\3.BCR未匹配到.xlsx'), sheet_name=0) + # step4(ste3_table) diff --git a/bcr/BCR_20240311.py b/bcr/BCR_20240311.py new file mode 100644 index 0000000..66c4f2c --- /dev/null +++ b/bcr/BCR_20240311.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/3/11 13:48 +# @Author : zhaoxiangpeng +# @File : BCR_20240311.py + +import os +import pandas as pd +from loguru import logger + +ADD_SOURCE = True +BASE_PATH = 'Y:\\BCR\\202407' + + +def load_all_small_file(path: str): + """加载所有的小文件""" + dirs = os.listdir(path) + for dir_ in dirs: + path1 = os.path.join(path, dir_) + files = os.listdir(path1) + for file in files: + full_file_path = os.path.join(path1, file) + yield full_file_path + + +def step0(): + gg = load_all_small_file(os.path.join(BASE_PATH, "API分工原始采集记录")) + big_table = pd.DataFrame() + for file_path in gg: + logger.debug('当前处理 %s' % file_path) + table = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, index_col=False) + if ADD_SOURCE: + simple_name = os.path.basename(file_path) + simple_name = simple_name + table['SOURCE'] = file_path + # columns = table.columns.values.tolist() + logger.debug('表头: %s' % table.columns.values.tolist()) + big_table = pd.concat([big_table, table]) + start = 0 + split = 1000000 + row, col = big_table.shape + file_idx = 1 + for x in range(start, row, split): + table = big_table[x: x + split] + save_path = os.path.join(BASE_PATH, "After") + table.to_csv(os.path.join(save_path, '%s.txt' % file_idx), sep='\t', index=False) + table.to_excel(os.path.join(save_path, '%s.xlsx' % file_idx), index=False) + file_idx += 1 + + +if 
__name__ == '__main__': + step0() + + diff --git a/bcr/BCR_20240426.py b/bcr/BCR_20240426.py new file mode 100644 index 0000000..5a5ade7 --- /dev/null +++ b/bcr/BCR_20240426.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/4/26 14:15 +# @Author : zhaoxiangpeng +# @File : BCR_20240426.py + +import os +import pandas as pd +from loguru import logger + +ROOT_PATH = 'Y:\\zhaoxiangpeng\\BCR202404' + + +def func1(): + """ + 用22年的BCR填充24年错误的数据 + """ + bcr22_path = 'Y:\\zhaoxiangpeng\\BCR2022' + bcr22_files = ['eid去重保留最大grandTotal-1.xlsx', 'eid去重保留最大grandTotal-2.xlsx', + 'eid去重保留最大grandTotal-3.xlsx'] + # 分片的旧文件合并 + bcr22_table = pd.read_csv(os.path.join(bcr22_path, 'eid去重保留最大grandTotal.csv')) + bcr22_table = bcr22_table[['EID', '2021']] + bcr22_table.drop_duplicates(subset=['EID'], inplace=True) + """ + for bcr22_file in bcr22_files: + temp_file = pd.read_excel(os.path.join(bcr22_path, bcr22_file), engine='openpyxl', sheet_name=0) + temp_file = temp_file[['EID', '2021']] + bcr22_table = pd.concat([bcr22_table, temp_file]) + """ + # 24年补充的数据 + bcr24_extend = pd.read_csv(os.path.join(ROOT_PATH, 'Grand Total为空-20240410 11时06分下载.csv'), index_col=False) + table_head = bcr24_extend.columns.values.tolist() + new_table = pd.merge(bcr24_extend, bcr22_table, how='left', on=['EID']) + new_table = new_table[['作者', '作者 ID', '标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', + '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', + '通讯地址', + '编者', '出版商', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', '原始文献语言', + '来源出版物名称缩写', '文献类型', '出版阶段', '访问类型', '来源出版物', 'EID', 'Sort Year', + '2021', '2022', '2023', '2024', 'Grand Total']] + print(new_table) + new_table.to_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t', index=False) + + +if __name__ == '__main__': + func1() diff --git a/bcr/BCR_20240724.py b/bcr/BCR_20240724.py new file mode 100644 index 0000000..9ec3b24 --- /dev/null +++ b/bcr/BCR_20240724.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/24 20:03 +# @Author : zhaoxiangpeng +# @File : BCR_20240724.py + +import os +import pandas as pd + +from bcr.utils import read_file, str2float, str2int +from bcr.BCR_20240201 import step2, step3, step4 +from bcr.BCR_20240201 import main, ROOT_PATH +from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH + +c2 = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '文献类型', 'DOI', 'ISBN', 'EID', + 'Sort Year', '2021', '2022', '2023', '2024', 'Grand Total'] + + +def step2_change(table: pd.DataFrame, reduce_columns: list = None, keep_columns: list = None, export: bool = True): + """ + ppt第二个需求修改 + """ + # 2024/12/25 14:58 修改,增加了reduce_columns参数用来替换固定值 + if reduce_columns is None: + reduce_columns = ['2021', '2022', '2023', 'Grand Total'] + if keep_columns is None: + keep_columns = c2 + # 处理数值类型 + for col in reduce_columns: + table[col] = table[col].apply(str2float) + # 正常聚合 + # 1.求和结果 + agg_result = table.groupby(by=['ISBN'])[reduce_columns].sum() + agg_result.reset_index(inplace=True) # 重置索引 + # 2.分块 + filter_table_is = table[table["文献类型"] == "Book"] + filter_table_not = table[table["文献类型"] != "Book"] + # 3.分别去重 + filter_table_is.drop_duplicates(subset=['ISBN'], keep='first', inplace=True) + filter_table_not.drop_duplicates(subset=['ISBN'], keep='first', inplace=True) + # 4.合并去重保留是Book的,book的在上面,重复项保留上面的 + merge_table = pd.concat([filter_table_is, filter_table_not]) + merge_table.drop_duplicates(subset=['ISBN'], keep='first', inplace=True) + # 5.删除多于列 + merge_table.drop(reduce_columns, axis=1, inplace=True) + # 重新匹配 + result = pd.merge(merge_table, 
agg_result, how='left', left_on=['ISBN'], right_on=['ISBN']) + result_table = result[keep_columns] + result['年份'] = result['年份'].astype(str) + result['Sort Year'] = result['Sort Year'].astype(str) + """ + # 新增的需求 + # 以ISBN聚合,重复项保留 + big_table = pd.DataFrame() + group_by = table.groupby(by=['ISBN']) + for _, group in group_by: + agg: pd.Series = group[reduce_columns].sum() + group_filter = group[group["文献类型"] == "Book"] + if group_filter.empty: + first = group[:1] + # total求和 + else: + first = group_filter[:1] + # 替换聚合的值 + first[reduce_columns] = agg + big_table = pd.concat([big_table, first]) + + group_by.reset_index(inplace=True) + """ + if export: + result_table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx"), index=False) + + return result_table + + +def main_change(): + table = read_file(os.path.join(ROOT_PATH, 'MergeFile')) + # 测试 + # table = pd.read_csv(os.path.join(ROOT_PATH, 'MergeFile\\3.txt'), sep='\t') + + t2 = pd.read_csv(os.path.join(ROOT_PATH, '补充数据填充2021年total.txt'), sep='\t') + table = pd.concat([table, t2]) + table.drop_duplicates(subset=['EID'], keep='last', inplace=True) + + # 把数量统计标准化 + table['2021'] = table['2021'].apply(str2float) + table['2022'] = table['2022'].apply(str2float) + table['2023'] = table['2023'].apply(str2float) + table['Grand Total'] = table['Grand Total'].apply(str2float) + step2_table = step2_change(table, export=True) + # step3_table, no_data_table = step3(step2_table, export=True) + # step4(no_data_table) + + +def change_field_type(): + table = pd.read_excel('Y:\\BCR\\202407\\RESULT\\2.统计ISBN使用量(保留DOI).xlsx', sheet_name=0, engine='openpyxl') + table['年份'] = table['年份'].apply(str2int) + table['Sort Year'] = table['Sort Year'].apply(str2int) + table.to_excel(os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI)2.xlsx"), index=False) + + +if __name__ == '__main__': + main_change() + # change_field_type() diff --git a/bcr/BCR_20241224.py b/bcr/BCR_20241224.py new file mode 100644 index 0000000..313bc1b --- /dev/null +++ b/bcr/BCR_20241224.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/12/24 15:03 +# @Author : zhaoxiangpeng +# @File : BCR_20241224.py + +import os +import re +import warnings +import chardet +import pandas as pd +from loguru import logger +from bcr.utils import read_file, str2float, export_small_file +import bcr.BCR_20240724 as bcr_20240724 +import bcr.BCR_20240201 as bcr_20240201 +from config import KEEP_COLUMNS, REDUCE_COLUMNS, ROOT_PATH + + +def task_change1(base_table: pd.DataFrame = None) -> pd.DataFrame: + """ + 补充失败的记录重新采集 + """ + extend_table = pd.read_excel(os.path.join(ROOT_PATH, 'BCR2024书目补采API.xlsx'), engine='openpyxl', sheet_name=0) + if isinstance(base_table, pd.DataFrame): + # 主表只保留eid用来对补数据的表进行去重 + dup_table = base_table[['EID']] + dup_table.drop_duplicates(subset=['EID'], inplace=True) + # eid列改名,防止有冲突 + dup_table.rename(columns={'EID': 'dup_eid'}, inplace=True) + # 扩展表的EID和主表的dup_eid列进行左连接,结果表dup_eid为空的的就是需要补充的行 + duped_table = extend_table.merge(right=dup_table, how='left', left_on=['EID'], right_on=['dup_eid']) + duped_table = duped_table[duped_table['dup_eid'].isnull()] + duped_table.drop(columns=['dup_eid'], inplace=True) + # 删除用来匹配的列 + all_data_table = pd.concat([base_table, duped_table]) + return all_data_table + return extend_table + + +def step1_merge(): + path = 'Y:\\BCR\\2025BCR' + path2 = os.path.join(path, 'MergeFile') + files = os.listdir(path2) + big_table = pd.DataFrame() + for file in files: + file_full_path = os.path.join(path2, file) + small_table = 
pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0) + # small_table = small_table[['EID']] + print(small_table.shape) + big_table = pd.concat([big_table, small_table]) + small_table = pd.read_csv(r'Y:\BCR\BCR202412\补采1-20241127 13时37分下载(1).csv') + big_table = pd.concat([big_table, small_table]) + return big_table + + +def step1_merge_change(): + """ + 处理补采的文件 + """ + path2 = os.path.join(ROOT_PATH, 'RESULT\文件和并结果') + files = os.listdir(path2) + big_table = pd.DataFrame() + for file in files: + file_full_path = os.path.join(path2, file) + small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0) + big_table = pd.concat([big_table, small_table]) + return task_change1(big_table) + + +def step2_change(table: pd.DataFrame, export: bool = True): + # 正常聚合 + # 1.求和结果 + # 求和前要先把数字类型给统一了 + table['2021'] = table['2021'].apply(str2float) + table['2022'] = table['2022'].apply(str2float) + table['2023'] = table['2023'].apply(str2float) + table['Grand Total'] = table['Grand Total'].apply(str2float) + # 把相同ISBN的记录合并成一条记录,多条记录的各年份和GrandTotal引用次数求和 + agg_result = table.groupby(by=['ISBN'])[['2021', '2022', '2023', 'Grand Total']].sum() + agg_result.reset_index(inplace=True) # 重置索引 + # 2.分块 + filter_table_is = table[table["文献类型"] == "Book"] + filter_table_not = table[table["文献类型"] != "Book"] + filter_table_is[KEEP_COLUMNS] + + +def main(): + STEP_IS_EXIST = True + if STEP_IS_EXIST: + table = step1_merge_change() + + # 判断表2的结果是否存在的逻辑 + step_2_table_path = os.path.join(ROOT_PATH, "RESULT\\2.统计ISBN使用量(保留DOI).xlsx") + if not os.path.exists(step_2_table_path): + step2_table = bcr_20240724.step2_change(table, reduce_columns=REDUCE_COLUMNS, keep_columns=KEEP_COLUMNS, + export=True) + else: + step2_table = pd.read_excel(step_2_table_path, sheet_name=0) + + # 第三步表结果是否存在的逻辑 + no_data_table_path = os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx') + if not os.path.exists(no_data_table_path): + step3_table, no_data_table = bcr_20240201.step3(step2_table, export=True) + else: + no_data_table = pd.read_excel(os.path.join(ROOT_PATH, r'RESULT\3.BCR未匹配到.xlsx'), sheet_name=0) + + # 处理第4步 + bcr_20240201.step4(no_data_table) + + +if __name__ == '__main__': + main() diff --git a/bcr/__init__.py b/bcr/__init__.py new file mode 100644 index 0000000..3857507 --- /dev/null +++ b/bcr/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/12/1 14:20 +# @Author : ZAOXG +# @File : __init__.py.py diff --git a/bcr/api记录匹配.py b/bcr/api记录匹配.py new file mode 100644 index 0000000..3211595 --- /dev/null +++ b/bcr/api记录匹配.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/12/1 16:48 +# @Author : ZAOXG +# @File : api记录匹配.py + +import data_process_tool +import pandas as pd + + +def step1(): + # 合并小文件 + data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API采集-BCR2022相同记录\API采集', on_columns=['EID'], encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip') + data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2', on_columns=['EID'], encoding_errors='ignore', on_bad_lines='skip') + + +def step2(): + record1 = data_process_tool.read_data(root_path+'2023记录.csv') + record1.drop_duplicates(inplace=True) + record1.rename(columns={'EID': '原始记录'}, inplace=True) + record2 = data_process_tool.read_data(root_path+'API采集.csv') + record2.drop_duplicates(inplace=True) + record3 = pd.merge(record2, record1, how='left', left_on=['EID'], right_on=['原始记录']) + print(record3) + error_record = record3[record3['EID'].isna()] + error_record.to_csv(root_path+'未匹配到记录2-EID.csv', 
index=False) + # error_record.to_excel(root_path+'未匹配到记录-EID.xlsx', index=False) + + +def step3(): + record1 = data_process_tool.read_data(root_path + '2023原始记录.csv') + record1.drop_duplicates(subset=['EID'], inplace=True) + # 对每个表的列名做标记 + record1_rename = {} + for r1_name in record1.columns: + record1_rename[r1_name] = r1_name + '(2023记录)' + record1.rename(columns=record1_rename, inplace=True) + + record2 = data_process_tool.read_data(root_path + 'API下载记录.csv') + record2.drop_duplicates(subset=['EID'], inplace=True) + record2_rename = {} + for r2_name in record2.columns: + record2_rename[r2_name] = r2_name + '(API记录)' + record2.rename(columns=record2_rename, inplace=True) + + # 左连接找出右表缺失字段 + record3 = pd.merge(record2, record1, how='left', left_on=['EID(API记录)'], right_on=['EID(2023记录)']) + print(record3) + error_record = record3[record3['EID(2023记录)'].isna()] + error_record.to_excel(root_path + 'API下载记录有2023原始记录无.xlsx', index=False) + + record4 = pd.merge(record1, record2, how='left', left_on=['EID(2023记录)'], right_on=['EID(API记录)']) + print(record4) + error_record2 = record4[record4['EID(API记录)'].isna()] + error_record2.to_excel(root_path + '2023原始记录有API下载记录无.xlsx', index=False) + + +if __name__ == '__main__': + root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/' + # step2() + data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API采集', to_type='csv', + encoding='GB2312', encoding_errors='ignore', on_bad_lines='skip') + # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录', encoding_errors='ignore', + # on_bad_lines='skip') + # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\2023原始记录\2022-11-21-下载记录', encoding_errors='ignore', + # on_bad_lines='skip') + # data_process_tool.merge_table(fr'F:\工作数据存储2022\20221201_bcrAPI对比\API下载记录\API失败记录重新下载采集', + # to_type='csv', + # encoding='GB2312', encoding_errors='ignore', + # on_bad_lines='skip') + # step3() + diff --git a/bcr/bcr记录保留多列.py b/bcr/bcr记录保留多列.py new file mode 100644 index 0000000..c7bd10a --- /dev/null +++ b/bcr/bcr记录保留多列.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/1/17 14:06 +# @Author : zhaoxiangpeng +# @File : bcr记录保留多列.py + +import data_process_tool +import pandas as pd + + +def func1(): + """ + 包留DOI, 来源出版物,来源出版物缩写 + """ + record1 = data_process_tool.read_data(root_path + 'eid去重.csv') + # 不为数字的列转为0 + # 删除有问题的行 + all_api_record = record1[['ISBN', '2020', '2021', '2022', 'GrandTotal']] + all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True) + all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0 + all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float + group_by = all_api_record.groupby(by=['ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum() + group_by.reset_index(inplace=True) + # group_by.to_csv('.....csv') # 如果需要保存... 
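    # (Editor's note) The steps below re-attach descriptive fields to the per-ISBN
    # sums: one representative row (DOI, 来源出版物名称, 出版商, 来源出版物名称缩写) is kept
    # per ISBN, then merged with how='right' so every aggregated ISBN ends up as
    # exactly one output row, with the descriptive columns filled where a
    # representative record exists.
    # Note: newer pandas versions reject selecting groupby columns with a bare
    # tuple of labels; a list selector, e.g.
    # all_api_record.groupby('ISBN')[['2020', '2021', '2022', 'GrandTotal']].sum(),
    # is the portable spelling (an assumption about the pandas version in use).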
+ # 取需要保留的列并去重只保留一个 + keep_columns = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN']] + keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True) + + table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN']) + print(table) + # table.to_csv(root_path+'统计ISBN使用量(保留来源出版物等字段).csv', index=False) + table.to_excel(root_path+'统计ISBN使用量(保留来源出版物等字段).xlsx', index=False) + + +def func2(): + """ + 将ISBN列分割为单个 + """ + record1 = data_process_tool.read_data(root_path + 'eid去重.csv') + # 保留需要的列 + all_api_record = record1[['DOI', '来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN', '2020', '2021', '2022', 'GrandTotal']] + + ISBNs = all_api_record['ISBN'].str.split('; ', expand=True) + ISBNs = ISBNs.stack() # 把行转成列 + ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引 + ISBNs.name = 'ISBN' + all_api_record = all_api_record.drop(['ISBN'], axis=1).join(ISBNs) + # 也要处理一下有问题的哪行 + all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True) + all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0 + all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float + + # 分组 + group_by = all_api_record.groupby(by=['来源出版物名称', '出版商', '来源出版物名称缩写', 'ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum() + group_by.reset_index(inplace=True) + + keep_columns = all_api_record[['DOI', 'ISBN']] + keep_columns.drop_duplicates(keep='first', subset=['ISBN'], inplace=True) + + table = pd.merge(left=keep_columns, right=group_by, how='right', on=['ISBN']) + print(table) + # table.to_csv(root_path + '统计ISBN使用量(ISBN分割).csv', index=False) + table.to_excel(root_path + '统计ISBN使用量(ISBN分割).xlsx', index=False) + + +if __name__ == '__main__': + root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/' + func1() + func2() + diff --git a/bcr/bcr记录合并.py b/bcr/bcr记录合并.py new file mode 100644 index 0000000..df7621e --- /dev/null +++ b/bcr/bcr记录合并.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/12/14 14:46 +# @Author : zhaoxiangpeng +# @File : api.py + +import data_process_tool +import pandas as pd +import numpy as np +import re + + +def step1(): + """ + EID去重保留gratetotal更大值的行 + """ + record1 = data_process_tool.read_data(root_path + 'API下载记录.csv') + record2 = data_process_tool.read_data(root_path + 'API失败记录重新下载采集.csv') + record3 = data_process_tool.read_data(root_path + '2023原始记录.csv') + # 失败记录 对比 api记录中缺失的列 + api_chaji = {'归属机构', 'CODEN', '访问类型', '带归属机构的作者', '来源出版物名称缩写', '通讯地址', 'PubMed ID', + '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'} + # 失败记录 对比 原始记录中缺失的列 + raw_api_chaji = {'EID', '归属机构', 'Author full names', '文献标题', 'CODEN', '访问类型', '带归属机构的作者', + '来源出版物名称缩写', '通讯地址', 'PubMed ID', '原始文献语言', 'ISSN', '出版商', '编者', 'ISBN'} + print(record2) + # 把 失败记录缺失的列在原始记录中补充 + record3 = record3[list(raw_api_chaji)] + record3.rename(columns={'EID': 'EID_copy'}, inplace=True) + temp_ = pd.merge(record2, record3, how='left', left_on=['EID'], right_on=['EID_copy']) + # 只保留与api记录相同的列 + record2 = temp_[record1.columns.values.tolist()] + # api记录与失败记录合并 + all_api_record = pd.concat([record1, record2]) + + # 保留GrandTotal最大值 的EID,换个思路,把 GrandTotal 列排序,对EID去重保留最后一个 + all_api_record.sort_values(by=['GrandTotal'], inplace=True) + all_api_record.drop_duplicates(subset=['EID'], keep='last', inplace=True) # 去重后的文件 + + # 不为数字的列转为0 + # 删除有问题的行 + all_api_record.drop(all_api_record[all_api_record['2020'] == '2-s2.0-84971016798'].index, inplace=True) + all_api_record['2020'].fillna(0, inplace=True) # 把空行换为0 + all_api_record['2020'] = all_api_record['2020'].astype(float) # 类型转为float + # 
all_api_record['2020'] = all_api_record['2020'].apply(lambda x: x if re.search("^\d+$", str(x)) else np.nan) + # 对ISBN + group_by = all_api_record.groupby(by=['ISBN'])['2020', '2021', '2022', 'GrandTotal'].sum() + group_by.to_excel(root_path + 'eid去重grandTotal合并.xlsx') + + +def is_float(data): + try: + return float(data) + except Exception: + print(data) + return 0 + + +if __name__ == '__main__': + root_path = 'F:/工作数据存储2022/20221201_bcrAPI对比/合并结果/' + step1() diff --git a/bcr/config.py b/bcr/config.py new file mode 100644 index 0000000..e795dc1 --- /dev/null +++ b/bcr/config.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/12/26 8:57 +# @Author : zhaoxiangpeng +# @File : config.py + +ROOT_PATH = r'Y:\BCR\BCR202412' # 根路径,输入输出的路径 +KEEP_COLUMNS = ['作者', '作者 ID', '文献标题', '年份', '来源出版物名称', '施引文献', 'DOI', '链接', '归属机构', '带归属机构的作者', '通讯地址', '编者', '出版商', 'ISSN', 'ISBN', 'PubMed ID', '原始文献语言', '来源出版物名称缩写', '文献类型', 'EID', 'Sort Year', '2022', '2023', '2024', 'Grand Total'] # 要保存的列 +REDUCE_COLUMNS = ['2022', '2023', '2024', 'Grand Total'] diff --git a/bcr/record_20230524.py b/bcr/record_20230524.py new file mode 100644 index 0000000..eaaac6e --- /dev/null +++ b/bcr/record_20230524.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/5/24 9:36 +# @Author : zhaoxiangpeng +# @File : record_20230524.py +# 2023-05-24 BCR处理 +# 需求 +# 根据doi、isbn补充eid去重表的字段;优先doi,其次isbn;出现重复时只保留一个 +# 实现 +# 1.先把大文件(eid去重表)根据isbn和eid去重,isbn是多个要先分开后再进行去重,只保留一个 +# 2.先对doi进行连接,把没有匹配到的用isbn进行连接,之后两次连接合并 + +# 缓存文件y盘有 + +import os +import openpyxl +from typing import Union, Tuple +import pandas as pd +import data_process_tool + +BASE_PATH = 'F:/工作数据存储2023/20230517_一年一度的BCR/' +HEAD = ['作者', '作者 ID', '标题', '年份', '来源出版物名称', '卷', '期', '论文编号', '起始页码', '结束页码', '页码计数', '施引文献', 'DOI', '链接', '归属机构', + '带归属机构的作者', '通讯地址', '编者', '出版商', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', '原始文献语言', '来源出版物名称缩写', '文献类型', '出版阶段', + '开放获取', '来源出版物', 'EID', 'Scopus ID', 'Title', 'Author', 'Author ID', 'Sort Year', '2020', '2021', '2022', + 'GrandTotal', '访问类型'] + + +def slice_read(path) -> pd.DataFrame: + + # 打开Excel文件,使用read_only模式 + workbook = openpyxl.load_workbook(filename=path, read_only=True) + + # 使用worksheet.iter_rows()方法读取单元格数据 + # 获取第一个worksheet对象 + worksheet = workbook.worksheets[0] + dataArray = [] + # 遍历单元格并读取数据 + for row in worksheet.iter_rows(max_col=40, values_only=True): + dataArray.append(row) + # 关闭Workbook对象 + workbook.close() + + table = pd.DataFrame(data=dataArray, columns=HEAD) + return table + + +def chunk_read(): + pd.read_csv() + + +def step0() -> pd.DataFrame: + """合并 eid去重保留最大grandTotal 压缩文件""" + cache = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv') + if os.path.exists(cache): + return data_process_tool.read_data(cache) + COMPRESS_PATH = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal-csv') + compress_files = os.listdir(COMPRESS_PATH) + big_join_file = pd.DataFrame() + compress_files.pop() + + for compress_file in compress_files: + data = data_process_tool.read_data( + os.path.join(COMPRESS_PATH, compress_file), + # low_memory=False + ) + big_join_file = pd.concat([big_join_file, data]) + # big_join_file.append( + # data_process_tool.read_data( + # os.path.join(COMPRESS_PATH, compress_file) + # ) + # ) + data = slice_read(os.path.join(BASE_PATH, 'eid去重保留最大grandTotal/eid去重保留最大grandTotal-3.xlsx')) + big_join_file = pd.concat([big_join_file, data]) + big_join_file.to_csv(cache, index=False) + return big_join_file + + +def step1() -> Tuple[pd.DataFrame, pd.DataFrame]: + """处理大文件""" + EID_PROED: Union[str, pd.DataFrame] = 
os.path.join(BASE_PATH, 'EID_processed.csv') + if not os.path.exists(EID_PROED): + + EID_PROED_CACHE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, 'eid去重保留最大grandTotal.csv') + if os.path.exists(EID_PROED_CACHE): + EID_PROED_CACHE: pd.DataFrame = data_process_tool.read_data(EID_PROED_CACHE) + else: + EID_PROED_CACHE: pd.DataFrame = step0() + ISBNs = EID_PROED_CACHE['ISBN'].str.split('; ', expand=True) + ISBNs = ISBNs.stack() # 把行转成列 + ISBNs = ISBNs.reset_index(level=1, drop=True) # 重置索引, 并删除多余的索引 + ISBNs.name = 'ISBN' + EID_PROED: pd.DataFrame = EID_PROED_CACHE.rename(columns={'ISBN': 'ISBN RAW'}).join(ISBNs) + # 对表头进行重命名 + new_columns = data_process_tool.rename_head(EID_PROED, postfix='-Other') + EID_PROED.rename(columns=new_columns, inplace=True) + # 缓存一下 + EID_PROED.to_csv(os.path.join(BASE_PATH, 'EID_processed.csv'), index=False) + else: + EID_PROED = data_process_tool.read_data(EID_PROED) + # 根据doi去重只保留一个用于doi匹配 + DOI_PROED = EID_PROED.dropna(subset=['DOI-Other']) + DOI_PROED.drop_duplicates(subset=['DOI-Other'], inplace=True) + + # 根据isbn去重保留一个用于剩下的做ISBN匹配 + ISBN_PROED = EID_PROED.dropna(subset=['ISBN-Other']) + ISBN_PROED.drop_duplicates(subset=['ISBN-Other'], inplace=True) + + return DOI_PROED, ISBN_PROED + + +def step2(): + BASE_FILE: Union[str, pd.DataFrame] = os.path.join(BASE_PATH, '副本BCR2022总表-20220729.xlsx') + BASE_FILE = data_process_tool.read_data(BASE_FILE) + doi_table, isbn_table = step1() + doi_ = pd.merge(BASE_FILE, doi_table, how='left', left_on=['DOI'], right_on=['DOI-Other']) + # 把doi分成有数据的和没数据的 + has_data = doi_[doi_['DOI-Other'].notnull()] # 匹配到数据 + no_data = doi_[doi_['DOI-Other'].isnull()] # 没有匹配到 + del doi_ + no_data = no_data[BASE_FILE.columns.values.tolist()] # 把没有数据的多余列去除 + # 用没有匹配到doi的数据用isbn进行匹配 + isbn_ = pd.merge(no_data, isbn_table, how='left', left_on=['ISBN.1'], right_on=['ISBN-Other']) # 这些就不用考虑没有匹配到的了,因为没有剩下的条件了 + # 合并doi匹配结果和isbn的结果 + result_table = pd.concat([has_data, isbn_]) + result_table.to_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv'), index=False) + + +def step3(): + """to_excel""" + df = pd.read_csv(os.path.join(BASE_PATH, 'BCR匹配结果.csv')) + df.to_excel(os.path.join(BASE_PATH, 'BCR匹配结果.xlsx'), index=False) + + +def main(): + # step0() + step2() + # step3() + + +if __name__ == '__main__': + main() + # slice_read() diff --git a/bcr/test.py b/bcr/test.py new file mode 100644 index 0000000..494448e --- /dev/null +++ b/bcr/test.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/7/25 15:33 +# @Author : zhaoxiangpeng +# @File : test.py + +import pandas as pd + +table = pd.DataFrame([ + {"A": 1, "B": 2, "C": 3, "D": 2, "T": "X"}, + {"A": 2, "B": 3, "C": 4, "D": 1, "T": "Y"}, + {"A": 3, "B": 4, "C": 1, "D": 2, "T": "X"}, + {"A": 4, "B": 1, "C": 2, "D": 3, "T": "Z"}, +]) +print(table) +print('-0'*50) +big_table = pd.DataFrame() +group_by = table.groupby(by=['T']) +for gn, group in group_by: + print('gn:', gn) + print(group) + a = group[["A", "B", "C", "D"]].sum() + print(a) + f = group[group['D'] == 2] + + if f.empty: + first = group.head(1) + else: + first = f.head(1) + first.loc[:, ("A", "B", "C", "D")] = a + print(first) + big_table = pd.concat([big_table, first]) + + print('-0-' * 50) + +print(big_table) diff --git a/bcr/utils.py b/bcr/utils.py new file mode 100644 index 0000000..99bf891 --- /dev/null +++ b/bcr/utils.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/2/4 10:55 +# @Author : zhaoxiangpeng +# @File : utils.py + +import os +from typing import List, Union +import pandas as pd + + +def read_file(path_or_files: 
Union[List[str], str], path: bool = True): + if path and isinstance(path_or_files, str): + path_or_files = [os.path.join(path_or_files, file) for file in os.listdir(path_or_files)] + big_table = pd.DataFrame() + for file in path_or_files: + table = pd.read_csv(file, sep='\t', low_memory=False) + big_table = pd.concat([big_table, table]) + return big_table + + +def export_small_file(big_table, export_path: str = None, split: int = int(8e5)): + """ + 大的表导出为小的表 + """ + row, col = big_table.shape + file_idx = 1 + for x in range(0, row, split): + table = big_table[x: x + split] + table.to_excel(os.path.join(export_path, '%s.xlsx' % file_idx), index=False) + file_idx += 1 + + +def str2float(string, replace=0): + try: + val = float(string) + except ValueError: + val = replace + except TypeError: + val = replace + return val + + +def str2int(string, replace=0): + try: + val = int(string) + except ValueError: + val = replace + except TypeError: + val = replace + return val diff --git a/bcr/合并小文件.py b/bcr/合并小文件.py new file mode 100644 index 0000000..72aa079 --- /dev/null +++ b/bcr/合并小文件.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/2/1 17:10 +# @Author : zhaoxiangpeng +# @File : 合并小文件.py + +import os +import re +import warnings +import chardet +import pandas as pd +from loguru import logger + + +def read_standard(filename): + table = pd.read_table(filename, encoding_errors='ignore', on_bad_lines='skip', low_memory=False) + return table + + +def merge_files(path): + big_table = pd.DataFrame() + files = os.listdir(path) + for file in files: + file_path = os.path.join(path, file) + table = read_standard(file_path) + big_table = pd.concat([big_table, table]) + + big_table.to_csv(os.path.join("Y:\\zhaoxiangpeng\\2024BCR", '2024BCR总表.csv'), sep='\t', index=False) + + +def read_file(file_path, encoding: str = 'gbk', error: bool = False): + if not error: + f = open(file_path, encoding=encoding) + else: + warnings.warn('%s 编码异常,启用检查' % file_path) + check = open(file_path, 'rb') + data = check.read() + info = chardet.detect(data) + encoding = info['encoding'] + kwargs = {} + kwargs.update(encoding=encoding) + warnings.warn('%s 尝试使用 "%s" 解码' % (file_path, encoding)) + f = open(file_path, **kwargs) + code = encoding + return f, encoding + + +def merge_files_by_row(path): + """ + 通过行读取的方式把小文件处理为标准的单个100,000条的文件 + """ + ERROR_FILE = ['ALL-CM.CSV', 'ALL-HDX.CSV', '失败记录第二次重采-20231228 15时53分下载.csv'] + data_column_count = 35 + files = os.listdir(path) + decode_table = dict() + split_str = '\t' + documents = [] + document_count = 0 + file_seq = 1 + for file in files: + + if file in ERROR_FILE: + split_str = ',' + logger.warning("文件可能被修改过, 跳过 %s" % file) + continue + else: + split_str = '\t' + file_path = os.path.join(path, file) + logger.info('处理 %s' % file_path) + f, code = read_file(file_path) + try: + h = f.readline() + head = h.strip('\n').split(split_str) + logger.debug("表头长度: %s, %s" % (len(head), head)) + except UnicodeDecodeError: + f, code = read_file(file_path, error=True) + h = f.readline() + head = h.strip('\n').split(split_str) + logger.debug("表头长度: %s, %s" % (len(head), head)) + if '' in head: + data_column_count = head.index('') + if len(head) > data_column_count: + head = head[:data_column_count] + # print(head) + while True: + # line = None + try: + line = f.readline() + except UnicodeDecodeError: + logger.info('错误行: %s' % line) + continue + if not line: + break + data = line.strip('\n').split(split_str) + documents.append( + dict(zip(head, data[:data_column_count])) + ) + 
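            # (Editor's note) The counter below drives the sharding: parsed rows are
            # buffered as dicts and, once 100,000 have accumulated, flushed to a
            # tab-separated shard file named by `file_seq`, after which the buffer
            # and counter are reset. Only one shard's worth of rows is held in
            # memory at a time; the final partial shard is written after the loop.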
document_count += 1 + + if document_count >= 1e5: + shard = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\After", '%s.csv' % file_seq) + logger.info("数据条数到达 %s 保存一个文件: %s" % (document_count, shard)) + big_table = pd.DataFrame(documents) + logger.info("配置 : %s %s" % big_table.shape) + big_table.to_csv(shard, sep='\t', index=False) + file_seq += 1 + documents = [] + document_count = 0 + f.close() + + shard = os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\After", '%s.csv' % file_seq) + logger.info("数据条数到达 %s 保存最后一片: %s" % (1e5, shard)) + big_table = pd.DataFrame(documents) + big_table.to_csv(shard, sep='\t', index=False) + + logger.info("文件编码表: %s" % decode_table) + + +def merge_error_file(path, files: list): + """合并小的文件""" + big_table = pd.DataFrame() + for file in files: + file_full_path = os.path.join(path, file) + small_table = pd.read_csv(file_full_path, low_memory=False, encoding_errors='ignore', on_bad_lines='skip') + print(small_table.shape) + big_table = pd.concat([big_table, small_table]) + start = 0 + split = 100000 + row, col = big_table.shape + file_idx = 101 + for x in range(start, row, split): + table = big_table[x: x + split] + table.to_csv(os.path.join('Y:\\zhaoxiangpeng\\2024BCR\\After', '%s.csv' % file_idx), index=False, sep='\t') + file_idx += 1 + + +def merge_standard_file(path): + files = os.listdir(path) + big_table = pd.DataFrame() + for file in files: + file_full_path = os.path.join(path, file) + small_table = pd.read_csv(file_full_path, sep='\t', low_memory=False) + big_table = pd.concat([big_table, small_table]) + row, col = big_table.shape + split = 1000000 + file_idx = 1 + for x in range(0, row, split): + table = big_table[x: x + split] + table.to_csv(os.path.join("Y:\\zhaoxiangpeng\\2024BCR\\MergeFile", '%s.csv' % file_idx), index=False, sep='\t') + file_idx += 1 + + +def merge_small_file(path): + files = os.listdir(path) + big_table = pd.DataFrame() + for file in files: + file_full_path = os.path.join(path, file) + small_table = pd.read_csv(file_full_path, index_col=False, low_memory=False, encoding_errors='ignore', on_bad_lines='skip') + big_table = pd.concat([big_table, small_table]) + row, col = big_table.shape + split = 800000 + file_idx = 1 + for x in range(0, row, split): + table = big_table[x: x + split] + table.to_excel(os.path.join("Y:\BCR\BCR202412\BCR2024书目补采API", '%s.xlsx' % file_idx), index=False) + file_idx += 1 + + +def find_eid_by_regex(text): + res = re.search(r'2-s2\.0-\d+', text) + if res: + return res.group(0) + return None + + +def batch_match(path): + count = 0 + line_count = 0 + eid_collect = [] + writer = open('Y:\\zhaoxiangpeng\\BCR\\2025BCR\\eid.csv', 'a+', encoding='utf-8') + writer.write('EID'+'\n') + file_list = os.listdir(path) + for fname in file_list: + file = os.path.join(path, fname) + with open(file, encoding='utf-8') as f: + while line := f.readline(): + line_count += 1 + eid = find_eid_by_regex(line) + if not eid: + print(line) + else: + count += 1 + writer.write(eid + '\n') + writer.close() + print('总行数:%s\n匹配到:%s' % (line_count, count)) + + +def func11(): + path = 'Y:\\zhaoxiangpeng\\BCR\\2025BCR' + path2 = os.path.join(path, 'MergeFile') + files = os.listdir(path2) + big_table = pd.DataFrame() + for file in files: + file_full_path = os.path.join(path2, file) + small_table = pd.read_excel(file_full_path, engine='openpyxl', sheet_name=0) + small_table = small_table[['EID']] + print(small_table.shape) + big_table = pd.concat([big_table, small_table]) + big_table.drop_duplicates(subset=['EID'], inplace=True) + t2 = 
pd.read_csv(os.path.join(path, 'eid.csv')) + t2.drop_duplicates(subset=['EID'], inplace=True) + t2.rename(columns={'EID': "EID2"}, inplace=True) + t0 = pd.merge(t2, big_table, how='left', left_on=['EID2'], right_on=['EID']) + print(t0) + t0[t0['EID'].isna()]['EID2'].to_csv(os.path.join(path, 'eid2.csv'), index=False) + + +if __name__ == '__main__': + # merge_files_by_row("Y:\\zhaoxiangpeng\\2024BCR\\API采集数据") + # merge_error_file("Y:\\zhaoxiangpeng\\2024BCR\\API采集数据", + # files=['ALL-CM.CSV', 'ALL-HDX.CSV', '失败记录第二次重采-20231228 15时53分下载.csv']) + # merge_standard_file('Y:\\zhaoxiangpeng\\2024BCR\\After') + merge_small_file(r'Y:\BCR\BCR202412\BCR2024书目补采API') + # batch_match('Y:\\zhaoxiangpeng\\BCR\\2025BCR\\API采集') + # func11() diff --git a/bnu_wos/20250108_func.py b/bnu_wos/20250108_func.py new file mode 100644 index 0000000..c401a08 --- /dev/null +++ b/bnu_wos/20250108_func.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/1/8 14:28 +# @Author : zhaoxiangpeng +# @File : 20250108_func.py + +import os +import pandas as pd +from pymongo import MongoClient + +READ_PATH = 'Y:\\wos-metadata\\SCHOOL\\bnu' + +DOI_SPLIT_SYMBOL = 'DOI ' +DOI_SPLIT_SYMBOL_LENGTH = len(DOI_SPLIT_SYMBOL) +INFO_SPLIT_SYMBOL = ', ' +INFO_SPLIT_SYMBOL_LENGTH = len(INFO_SPLIT_SYMBOL) + +# 引文表需要有一个UT字段用来作为主键 +REF_RECORD_TABLE_FIELD = ['UT', 'doi'] +# 完整记录要保留的字段 +FULL_RECORD_TABLE_FIELD = ['DI', 'SO', 'PT', 'UT', 'AB', 'SN', 'IS', 'EI', 'BN'] + + +client = MongoClient(config.MONGODB_REMOTE_CONFIG.get("url")) + + +def find_doi_data_from_mongo(): + pass + + +def doi_match_full_record(): + """ + 对引文的doi补充完整记录 + """ + doi_table = pd.DataFrame() + full_record_table = pd.DataFrame() + ref_table = doi_table.merge(right=full_record_table, how='left', left_on=['di'], right_on=['DI']) + ref_table.to_csv(os.path.join(os.path.join('Y:\数据采集需求\陶思琪20211203', 'doi展开')), '%s.csv' % '', index=False) + + +def func1(): + """ + 匹配查找没有下载到的wos号 + """ + os.listdir() + + +def verify_doi(text) -> str: + doi_idx = text.find(DOI_SPLIT_SYMBOL) + if doi_idx != -1: + doi_str = text[doi_idx + DOI_SPLIT_SYMBOL_LENGTH:] + else: + doi_str = text + if doi_str.endswith(')'): + doi_str = doi_str[:-1] + return doi_str + + +def ref_str2dic(text): + ref_list = text.split('; ') + for ref in ref_list: + # print(ref) + """ + """ + # 解析引文字段的信息 + ref_copy = ref + var1 = [] + for _ in range(3): + idx_t = ref_copy.find(INFO_SPLIT_SYMBOL) + var1.append(ref_copy[:idx_t]) + ref_copy = ref_copy[idx_t + INFO_SPLIT_SYMBOL_LENGTH:] + au, py, so = var1 + doi_idx = ref.find(DOI_SPLIT_SYMBOL) + model = dict(au=au, py=py, so=so) + if doi_idx != -1: + doi_text = ref[doi_idx+DOI_SPLIT_SYMBOL_LENGTH:] + if doi_text.startswith('['): + doi_mutil_text = doi_text[1:-1] + doi_list = doi_mutil_text.split(', ') + for doi_str in doi_list: + if doi := verify_doi(doi_str): + obj = dict(doi=doi) + obj.update(**model) + yield obj + else: + if doi := verify_doi(doi_text): + obj = dict(doi=doi) + obj.update(**model) + yield obj + else: + obj = dict(doi=None) + obj.update(**model) + yield obj + + +def mian2(): + school_list = os.listdir(READ_PATH) + for ff in school_list: + if ff == 'doi展开' or ff == '待下载DOI': + continue + clear = [] + school_path = os.path.join(READ_PATH, ff) + file_list = os.listdir(school_path) + for file in file_list: + f = os.path.join(school_path, file) + table = pd.read_csv(f, sep='\t', error_bad_lines=False) + table = table[['UT', "CR"]] + values = table.values.tolist() + for value in values: + third_id, article_reference_text = value + if 
pd.isna(article_reference_text): + continue + _g = ref_str2dic(article_reference_text) + for r_doi in _g: + r_doi.setdefault('third_id', third_id) + clear.append(r_doi) + + table2 = pd.DataFrame(clear) + table2.to_csv(os.path.join(os.path.join('Y:\数据采集需求\陶思琪20211203', 'doi展开'), '%s.csv' % os.path.basename(school_path)), + index=False) + + +if __name__ == '__main__': + """ + text1 = 'Baskonus HM, 2019, APPL MATH NONLIN SCI, V4, P129, DOI 10.2478/AMNS.2019.1.00013; Deng M, 2020, OPT EXPRESS, V28, P24152, DOI 10.1364/OE.395204; Kaur D, 2020, APPL MATH NONLIN SCI, V5, P15, DOI 10.2478/AMNS.2020.2.00011; Le LT, 2020, MARIT POLICY MANAG, V47, P615, DOI 10.1080/03088839.2020.1729437; Lin HR, 2021, IEEE T CIRCUITS-I, V68, P3397, DOI 10.1109/TCSI.2021.3081150; Ray A, 2020, NAT HAZARDS, V103, P3523, DOI 10.1007/s11069-020-04141-2; Shariati M., 2021, J. Adv. Eng. Comput., V5, P50, DOI 10.25073/jaec.202151.308; Sharifi Y, 2020, IJST-T CIV ENG, V44, P579, DOI 10.1007/s40996-019-00281-z; Yariyan P, 2022, GEOCARTO INT, V37, P4312, DOI 10.1080/10106049.2021.1892208; Zhao DW, 2020, J MATER RES TECHNOL, V9, P1231, DOI 10.1016/j.jmrt.2019.11.050' + text2 = 'Booth W., 1979, CRITICAL UNDERSTANDI; Booth Wayne, 1988, THE COMPANY WE KEEP; Booth WC, 2005, COMPANION TO NARRATIVE THEORY, P75, DOI 10.1002/9780470996935.ch5; Booth Wayne C., 1983, THE RHETORIC OF FICT, P401; Booth Wayne C., 1961, THE RHETORIC OF FICT; Booth WayneC., 1994, UNDERSTANDING NARRAT, P99; Chatman Seymour, 1978, STORY AND DISCOURSE; Crane R. S., 1952, Critics and Criticism, P62; Emanuel JamesA., 1967, LANGSTON HUGHES; Herman D, 2008, PARTIAL ANSW, V6, P233, DOI 10.1353/pan.0.0019; Herman L, 2011, STYLE, V45, P11; Kindt T, 2011, STYLE, V45, P67; NELLES W, 1993, COMP LITERATURE, V45, P22, DOI 10.2307/1771304; Nunning Ansgar, 2005, THE ROUTLEGE ENCYCLO, P239; Nünning V, 2004, STYLE, V38, P236; Phelan J, 2011, STYLE, V45, P119; Phelan James, 1996, NARRATIVE RHETORIC T; Phelan James, 2005, LIVING TO TELL ABOUT; Phelan James., 2007, EXPERIENCING FICTION; RABINOWITZ PJ, 1977, CRIT INQUIRY, V4, P121, DOI 10.1086/447927; RABINOWITZ PJ, 1987, BEFORE READING NARRA; Rader Ralph W, 1978, U TORONTO Q, V48, P149; Rader RalphW., 1999, Ideology and Form in Eighteenth-Century Literature, P47; RADER RW, 1984, CRIT INQUIRY, V10, P567, DOI 10.1086/448264; Richardson B, 2011, STYLE, V45, P1; Ryan ML, 2011, STYLE, V45, P29; Shen D, 2007, J LITERARY SEMANTICS, V36, P53, DOI 10.1515/JLS.2007.003; Shen D, 2011, STYLE, V45, P576; Shen D, 2011, STYLE, V45, P80; Shen D, 2010, ENGL STUD, V91, P150, DOI 10.1080/00138380903355163; Shen D, 2008, NINETEEN CENT LIT, V63, P321, DOI 10.1525/ncl.2008.63.3.321; Shen Dan, THE LIVING HANDBOOK; Stefanescu M, 2011, STYLE, V45, P48; Tillotson Kathleen Mary., 1959, THE TALE AND THE TEL; Zerweck B, 2001, STYLE, V35, P151' + text3 = '[Anonymous], POSTMODERNISM AM LIT; BARTH J, 1980, ATLANTIC, V245, P65; Bertens Hans., 1986, Approaching Postmodernism: Papers Presented at a Workshop on Postmodernism, 21-23 September 1984, University of Utrecht, P9; Calinescu Matei., 1987, EXPLORING POSTMODERN; Calinescu Matei., 1987, 5 FACES MODERNITY MO; Cohen Ralph., 1989, POSTMODERN GENRES, P11; Connor S., 1989, POSTMODERNIST CULTUR; FANG F, 1992, ZHONGPIANXIAOSHUO XU, V2, P93; FOKKEMA Douwe., 1986, Approaching Postmodernism; Fokkema DouweW., 1984, Literary History, Modernism, and Postmodernism; FORSTER H, 1983, ANTI-AESTHETIC ESSAY; Habermas J., 1981, NEW GER CRIT, P3; HABERMAS J, ANTI-AESTHETIC ESSAY; Hassan Ihab., 1975, PARACRITICISMS 7 SPE; Hassan 
IhabH., 1987, The Postmodern Turn: Essays in Postmodern Theory and Culture; Hutcheon Linda., 1988, POETICS POSTMODERNIS; Hutcheon Linda., 1989, Politics of Postmodernism; Hutcheon Linda., 1989, Postmodern Genres. Ed, P54; Jameson Frederic., 1991, POSTMODERNISM, DOI DOI 10.1215/9780822378419-002; Jameson Fredric., 1981, The Political Unconscious: Narrative as a Socially Symbolic Act; KOHLER M, 1977, AMERIKASTUDIEN, V22, P8; LIU Z, 1992, WENYI ZHENGMIN DEBAT, V1, P73; Lyotard Jean-Francois., POSTMODERN CONDITION; MCHALE B, 1990, STYLE, V24, P1; McHale Brian., 1992, CONSTRUCTING POSTMOD; McHale Brian., 1987, POSTMODERNIST FICTIO; Nemoianu Virgil., 1989, A Theory of the Secondary: Literature, Progress, and Reaction; Perloff Marjorie., 1989, POSTMODERN GENRES; PUTZ M, 1984, POSTMODERNISM AM LIT; Ross Andrew., 1986, FAILURE MODERNISM SY; WANG N, 1993, SOCIAL SCI CHINA, V19, P5; WANG N, INPRESS NEW LIT HIST; Wilde Alan., 1981, HORIZONS ASSENT MODE' + text4 = "Abadi M, 2004, LECT NOTES COMPUT SC, V3142, P46; Abadi M, 2002, J CRYPTOL, V15, P103, DOI 10.1007/s00145-001-0014-7; Abadi M, 2001, ACM SIGPLAN NOTICES, V36, P104, DOI 10.1145/373243.360213; Abadi M., 1991, Proceedings of the Tenth Annual ACM Symposium on Principles of Distributed Computing, P201, DOI 10.1145/112600.112618; Accorsi R., 2001, LOGICAL ASPECTS CRYP, V55, P5, DOI [10.1016/S1571-0661(04)00242-7, DOI 10.1016/S1571-0661(04)00242-7]; Alur R, 1998, LECT NOTES COMPUT SC, V1427, P521, DOI 10.1007/BFb0028774; Alur R, 2007, LECT NOTES COMPUT SC, V4424, P664; Alur R, 2006, LECT NOTES COMPUT SC, V4052, P107; Anderson R, 1995, LECT NOTES COMPUT SC, V1000, P426; [Anonymous], 2004, INT C COMPUTER AIDED, DOI DOI 10.1007/978-3-540-27813-9_41; [Anonymous], 2009, J APPL NONCLASSICAL; Baltag A, 1999, Technical Report SEN-R9922; Baltag A, 2008, SYNTHESE, V165, P179, DOI 10.1007/s11229-008-9369-8; Baskar A., 2007, Proceedings of the 11th conference on Theoretical aspects of rationality and knowledge, P62; Bhargava M, 2005, LECT NOTES COMPUT SC, V3653, P171, DOI 10.1007/11539452_16; Bieber P., 1990, Proceedings. The Computer Security Foundations Workshop III (Cat. No.TH0315-2), P14, DOI 10.1109/CSFW.1990.128181; BROOKES SD, 1984, J ACM, V31, P560, DOI 10.1145/828.833; BURROWS M, 1990, ACM T COMPUT SYST, V8, P18, DOI [10.1145/77648.77649, 10.1145/74851.74852]; Chadha R, 2009, LECT NOTES COMPUT SC, V5522, P182, DOI 10.1007/978-3-642-02138-1_12; Chaum D., 1988, Proceedings of the Twentieth Annual ACM Symposium on Theory of Computing, P11, DOI 10.1145/62212.62214; Chaum D., 1988, Journal of Cryptology, V1, P65, DOI 10.1007/BF00206326; Ciobâca S, 2009, LECT NOTES ARTIF INT, V5663, P355, DOI 10.1007/978-3-642-02959-2_27; Clarke E. M., 1998, Programming Concepts and Methods. PROCOMET '98. IFIP TC2/WG2.2,2.3 International Conference, P87; Clarke EM, 1999, MODEL CHECKING, P1; COHEN M, 2005, P FCS 05, P121; Cohen M., 2005, METHODS MODALITIES, P202; Cohen M., 2009, P AAMAS, P945; Cohen M, 2007, IEEE S LOG, P77, DOI 10.1109/LICS.2007.4; Cohen M, 2009, 21ST INTERNATIONAL JOINT CONFERENCE ON ARTIFICIAL INTELLIGENCE (IJCAI-09), PROCEEDINGS, P721; Cremers C., 2006, Ph.D. 
dissertation; Dechesne F, 2008, LECT NOTES COMPUT SC, V5160, P111, DOI 10.1007/978-3-540-85762-4_8; Dechesne F, 2007, LECT NOTES ARTIF INT, V4790, P226, DOI 10.1007/978-3-540-75560-9_18; Delaune S, 2009, J COMPUT SECUR, V17, P435, DOI 10.3233/JCS-2009-0340; Delaune Stephanie., 2006, 19 IEEE COMPUTER SEC, P28; Dixon C., 2003, ULCS03022 U LIV DEP; DOLEV D, 1983, IEEE T INFORM THEORY, V29, P198, DOI 10.1109/TIT.1983.1056650; Durgin N., 1999, WORKSH FORM METH SEC; EMERSON EA, 1987, INFORM PROCESS LETT, V24, P77, DOI 10.1016/0020-0190(87)90097-4; Engelhardt K., 2002, ADV MODAL LOGIC, V9, P9; Engelhardt K, 2007, LECT NOTES COMPUT SC, V4514, P195, DOI 10.1007/978-3-540-72734-7_14; Fabrega F. J. T., 1999, Journal of Computer Security, V7, P191; Fagin R., 1995, Proceedings of the Fourteenth Annual ACM Symposium on Principles of Distributed Computing, P153, DOI 10.1145/224964.224982; Fagin R., 1995, Reasoning About Knowledge; Fischer MJ, 1996, J CRYPTOL, V9, P71, DOI 10.1007/s001459900004; Focardi R, 2004, LECT NOTES ARTIF INT, V2946, P139; Francien D., 2007, P WORKSH LOG RAT INT, P129; GARCIA F ERNANDEZ., 2005, Actas del XVI Congreso de ASELE, P63; Gerbrandy J., 1997, Journal of Logic, Language and Information, V6, P147, DOI 10.1023/A:1008222603071; Gong L., 1990, Proceedings. 1990 IEEE Computer Society Symposium on Research in Security and Privacy (Cat. No.90CH2884-5), P234, DOI 10.1109/RISP.1990.63854; Halpern J, 2002, P IEEE CSFW, P32, DOI 10.1109/CSFW.2002.1021805; Halpern J. Y., 1994, Theoretical Aspects of Reasoning About Knowledge. Proceedings of the Fifth Conference (TARK 1994), P255; Halpern J. Y., 2003, ACM Transactions on Information and Systems Security, V6, P43, DOI 10.1145/605434.605436; Halpern JY, 2005, J COMPUT SECUR, V13, P483; Halpern JY, 2011, ARTIF INTELL, V175, P220, DOI 10.1016/j.artint.2010.04.009; Halpern Joseph Y, 1991, Artificial Intelligence and Mathematical Theory of Computation. Papers in Honor of John McCarthy, V212, P151, DOI DOI 10.1016/B978-0-12-450010-5.50015-3; Halpern JosephY., 1986, Proceedings of the eighteenth annual ACM symposium on Theory of computing, P304; HALPERN JY, 1989, DISTRIB COMPUT, V3, P159, DOI 10.1007/BF01784885; Halpern JY, 2003, LECT NOTES COMPUT SC, V2629, P115; HALPERN JY, 1990, J ACM, V37, P549, DOI 10.1145/79147.79161; HINTIKKA J., 1962, Knowledge and Belief: An Introduction to the Logic of the Two Notions; Hommersom A, 2005, INFORMATION, INTERACTION, AND AGENCY, P289, DOI 10.1007/1-4020-4094-6_10; Hoshi T, 2009, SYNTHESE, V169, P259, DOI 10.1007/s11229-009-9552-6; Hunter A., 2007, Proceedings of the 22nd national conference on Artificial intelligence, V1, P427; Jonker HL, 2006, LECT NOTES COMPUT SC, V4176, P476; Jonker H. L., 2006, WORKSH TRUSTW EL 200; Kacprzak M, 2008, FUND INFORM, V85, P313; Kramer S., 2007, THESIS EPFL; LAUCHLI H, 1987, J SYMBOLIC LOGIC, V52, P219, DOI 10.2307/2273878; Lomuscio Alessio, 2007, SIGACT News, V38, P77, DOI 10.1145/1324215.1324231; Lomuscio A., 2006, 5 INT JOINT C AUT AG, P548; Lomuscio A, 2006, LECT NOTES COMPUT SC, V3920, P450; Lomuscio A, 2009, LECT NOTES COMPUT SC, V5643, P682, DOI 10.1007/978-3-642-02658-4_55; Lowe G., 1996, Tools and Algorithms for the Construction and Analysis of Systems. Second International Workshop, TACAS '96. Proceedings, P147; Meyden van der R., 2007, P 11 C THEOR ASP RAT, P212; NEEDHAM RM, 1978, COMMUN ACM, V21, P993, DOI 10.1145/359657.359659; Orzan S., 2005, LYS; Parikh R., 1985, Logics of Programs. 
Proceedings, P256; Parikh R., 2003, Journal of Logic, Language and Information, V12, P453, DOI 10.1023/A:1025007018583; Paulson L. C., 1998, Journal of Computer Security, V6, P85; Paulson LC, 1997, P IEEE CSFW, P70, DOI 10.1109/CSFW.1997.596788; Petride S., 2007, P 11 C THEORETICAL A, P239; Plaza J. A., 1989, P 4 INT S METH INT S, P201; Pucella R, 2006, J LOGIC COMPUT, V16, P287, DOI 10.1093/logcom/exi078; Ramanujam R, 2005, J COMPUT SECUR, V13, P135, DOI 10.3233/JCS-2005-13106; Ramanujam R., 2005, P TARK, P219; Reiter M. K., 1998, ACM T INFORM SYST SE, V1, P66, DOI DOI 10.1145/290163.290168; Ryan P.Y. A., 2001, MODELLING ANAL SECUR; Shilov Nikolay V., 2002, FICS, P25; Shmatikov V., 2004, Journal of Computer Security, V12, P355; Su K, 2004, PROCEEDING OF THE NINETEENTH NATIONAL CONFERENCE ON ARTIFICIAL INTELLIGENCE AND THE SIXTEENTH CONFERENCE ON INNOVATIVE APPLICATIONS OF ARTIFICIAL INTELLIGENCE, P98; SYVERSON P, 1992, J COMPUTER SECURITY, V1, P317; Syverson PF, 1999, LECT NOTES COMPUT SC, V1708, P814; Teepe W., 2006, P FAMAS 06, V6, P79; van Benthem J, 2006, INFORM COMPUT, V204, P1620, DOI 10.1016/j.ic.2006.04.006; van Benthem J, 2009, J PHILOS LOGIC, V38, P491, DOI 10.1007/s10992-008-9099-x; van Benthem Johan., 2007, P 11 C THEORETICAL A, P72; van der Hoek W., 2002, Proceedings of the First International Joint Conference on Autonomous Agents and Multiagent Systems, P1167; van der Meyden R, 2004, P IEEE CSFW, P280, DOI 10.1109/CSFW.2004.1310747; van der Meyden R., 1999, Foundations of Software Technology and Theoretical Computer Science. 19th Conference. Proceedings (Lecture Notes in Computer Science Vol.1738), P432; van Ditmarsch H., 2008, UNCONDITIONALLY SECU; van Ditmarsch HP, 2006, ELECTRON NOTES THEOR, V149, P105, DOI 10.1016/j.entcs.2005.07.029; van Ditmarsch H. 
P., 2003, Studia Logica, V75, P31, DOI 10.1023/A:1026168632319; van Eijck J., 2005, DEMO PROGRAM DOCUMEN; van Eijck J, 2007, ELECTRON NOTES THEOR, V168, P159, DOI 10.1016/j.entcs.2006.08.026; Von;;;;;;;;Wright GeorgH., 1951, ESSAY MODAL LOGIC; Wang Y., 2010, AAMAS 10 IN PRESS; WANG Y., 2009, Proceedings of the 12th Conference on Theoretical Aspects of Rationality and Knowledge, TARK'09 (California, 2009), P257" + text5 = 'ADAMS F, 2003, BLACKWELL GUIDE PHIL, P143; [Anonymous], 2005, The Oxford Handbook of Philosophy of Mathematics and Logic; Baker A, 2005, MIND, V114, P223, DOI 10.1093/mind/fzi223; Benacerraf P., 1973, Journal of Philosophy, V70, P661, DOI DOI 10.2307/2025075; Chihara C., 2005, OXFORD HDB PHILOS MA; Colyvan M, 1999, PHILOS STUD, V96, P1, DOI 10.1023/A:1004248218844; Davidson D., 1980, ESSAYS ACTIONS EVENT; Field H, 1972, J PHILOS, V69, P347, DOI DOI 10.2307/2024879; Field H., 1989, Realism, mathematics and modality; Field Hartry, 1980, Science Without Numbers; HELLMAN G, 2005, OXFORD HDB PHILOS MA; HODES HT, 1984, J PHILOS, V81, P123, DOI 10.2307/2026440; Leng Mary., 2007, Mathematical Knowledge; Maddy P., 1997, NATURALISM MATH; Maddy P., 2007, Second philosophy: A naturalistic method; Neander K., 2004, STANFORD ENCY PHILOS; Papineau David., 2007, The Stanford Encyclopedia of Philosophy; Quine W.V., 1995, STIMULUS SCI; Quine W.V.O., 1992, PURSUIT TRUTH; Rosen G, 2005, OXFORD HDB PHILOS MA; SOBER E, 1993, PHILOS REV, V102, P35, DOI 10.2307/2185652; Stoljar D., 2010, PHYSICALISM; Wang H., 2007, LOGICAL JOURNEY GODE; Yalowitz S., 2005, Stanford Encyclopedia of Philosophy; Ye F., SYNTHESE IN PRESS; YE F, INTRO NATURALISTIC P; Ye F., PHILOS MATH IN PRESS; YE F, STRUCTURAL THEORY CO' + + g = ref_str2dic(text4) + print([_ for _ in g]) + """ + mian2() diff --git a/bnu_wos/__init__.py b/bnu_wos/__init__.py new file mode 100644 index 0000000..a08425e --- /dev/null +++ b/bnu_wos/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/1/8 14:28 +# @Author : zhaoxiangpeng +# @File : __init__.py.py diff --git a/data_process_tool/__init__.py b/data_process_tool/__init__.py new file mode 100644 index 0000000..604e72b --- /dev/null +++ b/data_process_tool/__init__.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/8/5 8:58 +# @Author : ZAOXG +# @File : __init__.py + +from .file_read import read_data +from .file_write import write_data +from .merge_table import merge_table, get_dirs_file +from .change_head import rename_head + +__all__ = [ + 'read_data', + 'write_data', + 'merge_table', + 'get_dirs_file', + 'rename_head' +] diff --git a/data_process_tool/change_head.py b/data_process_tool/change_head.py new file mode 100644 index 0000000..d1d38fa --- /dev/null +++ b/data_process_tool/change_head.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/5/24 15:23 +# @Author : zhaoxiangpeng +# @File : change_head.py + +import pandas as pd +from typing import Union + + +def rename_head(table_or_head: Union[pd.DataFrame, list], postfix: str = '-other') -> dict: + if isinstance(table_or_head, pd.DataFrame): + table_or_head = table_or_head.columns.values.tolist() + new_head = {} + for head in table_or_head: + new_head[head] = str(head) + postfix + return new_head diff --git a/data_process_tool/condition_filter.py b/data_process_tool/condition_filter.py new file mode 100644 index 0000000..40117fe --- /dev/null +++ b/data_process_tool/condition_filter.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/9/20 9:05 +# @Author : ZAOXG +# @File : condition_filter.py + 
+import data_process_tool +import pandas as pd + + +def get_filter(text): + text_list = text.split('; ') + for t in text_list: + t.index() + + +def method1(table): + table_name = None + if isinstance(table, str): + table_name = str(table) + table = data_process_tool.read_data(table) + # Keep only the unique-id column UT and the publication-address column C1 + # Split the address column on ';', filter the rows that meet the conditions, and collect their ids + # Match the qualifying rows back against the original table and keep only the matched rows + new_table = table[['UT', 'C1']] + new_table.rename(columns={'UT': 'UT-BP'}, inplace=True) + locations = table['C1'].str.split('; ', expand=True) + locations = locations.stack() + locations = locations.reset_index(level=1, drop=True) + locations.name = 'LOCATION' + new_table = new_table.drop(['C1'], axis=1).join(locations) + new_table['LOCATION'] = new_table['LOCATION'].str.lower() + print(new_table) + # the pattern univ petr|petr univ|unit petr|univj petr comes from the WOS search strategy + # filter the East China (Huadong) campus + hd: pd.DataFrame = new_table[(new_table['LOCATION'].str.contains('univ petr|petr univ|unit petr|univj petr')) & (new_table['LOCATION'].str.contains('shandong|qingdao|dongying'))] + del hd['LOCATION'] + hd.drop_duplicates(subset=['UT-BP'], inplace=True) + hd = pd.merge(table, hd, 'left', left_on=['UT'], right_on=['UT-BP']) + hd = hd[hd['UT-BP'].notnull()] + del hd['UT-BP'] + hd.to_excel(table_name.replace('.xlsx', '-华东.xlsx'), index=False) + # filter the Beijing campus + bj: pd.DataFrame = new_table[(new_table['LOCATION'].str.contains('univ petr|petr univ|unit petr|univj petr')) & (new_table['LOCATION'].str.contains('beijing'))] + del bj['LOCATION'] + bj.drop_duplicates(subset=['UT-BP'], inplace=True) + bj = pd.merge(table, bj, 'left', left_on=['UT'], right_on=['UT-BP']) + bj = bj[bj['UT-BP'].notnull()] + del bj['UT-BP'] + bj.to_excel(table_name.replace('.xlsx', '-北京.xlsx'), index=False) + + +def condition_filter(table, filters: list = None): + table_name = None + if isinstance(table, str): + table_name = str(table) + table = data_process_tool.read_data(table) + table['C1_BP'] = table['C1'].str.lower() + hd = table[(table['C1_BP'].str.contains('shandong')) | (table['C1_BP'].str.contains('qingdao')) | (table['C1_BP'].str.contains('dongying'))] + del hd['C1_BP'] + hd.to_excel(table_name.replace('.xlsx', '华东.xlsx'), index=False) + bj = table[(table['C1_BP'].str.contains('beijing'))] + del bj['C1_BP'] + bj.to_excel(table_name.replace('.xlsx', '北京.xlsx'), index=False) + + +if __name__ == '__main__': + # condition_filter(r'F:\工作数据存储2022\20220919_中国石油大学wos\2020.xlsx') + method1(r'Z:\客户数据存储\WOS\中国石油大学\OG=(China University of Petroleum) AND (PY==(2022))\OG=(China University of Petroleum) AND (PY==(2022)).xlsx') diff --git a/data_process_tool/example.py b/data_process_tool/example.py new file mode 100644 index 0000000..38982e2 --- /dev/null +++ b/data_process_tool/example.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/9/21 15:12 +# @Author : ZAOXG +# @File : example.py + +import re +import pandas as pd +from data_process_tool import read_data + + +def format_conversion(file_name: str): + file_df = read_data(file_name) + # strip simple <tag> markup from the 'wss' column (column name taken from the draft note file_df['wss'].str) + file_df['wss'] = file_df['wss'].astype(str).apply(lambda s: re.sub(r'<[A-Za-z]+>.*?', '', s)) diff --git a/data_process_tool/file_read.py b/data_process_tool/file_read.py new file mode 100644 index 0000000..4ff5e3c --- /dev/null +++ b/data_process_tool/file_read.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/8/5 8:58 +# @Author : ZAOXG +# @File : file_read.py + +import chardet +import pandas as pd +import warnings + +__all__ = [ + 'read_data' +] + +file_type_operation = { + 'csv': pd.read_csv, + 'xlsx': pd.read_excel, + 'xls': pd.read_excel, + 'txt': pd.read_table, + 'xls2':
pd.read_html +} + + +def read_data(file: str, **kwargs) -> pd.DataFrame: + if '.' in file: + file_type = file.rsplit('.')[-1] + else: + file_type = 'txt' + file_type = file_type.lower() + try: + # if file_type == 'txt': + # kwargs.update(sep='\t') + temp: pd.DataFrame = file_type_operation[file_type](file, **kwargs) + except UnicodeDecodeError: + warnings.warn('%s 编码异常,启用检查' % file) + with open(file, 'rb') as f: + data = f.read() + info = chardet.detect(data) + encoding = info['encoding'] + kwargs.update(encoding=encoding) + warnings.warn('%s 尝试使用 "%s" 解码' % (file, encoding)) + temp = read_data(file, **kwargs) + + return temp diff --git a/data_process_tool/file_write.py b/data_process_tool/file_write.py new file mode 100644 index 0000000..e5a365a --- /dev/null +++ b/data_process_tool/file_write.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/11/1 14:26 +# @Author : ZAOXG +# @File : file_write.py + + +import pandas as pd + +__all__ = [ + 'write_data' +] + +file_type_operation = { + 'csv': 'to_csv', + 'xlsx': 'to_excel', + 'xls': 'to_excel' +} + + +def write_data(data, file, index=True, **kwargs) -> pd.DataFrame: + file_type = file.rsplit('.')[-1] + temp: pd.DataFrame = getattr(data, file_type_operation[file_type])(file, index=index, **kwargs) + return temp diff --git a/data_process_tool/get_lose_year.py b/data_process_tool/get_lose_year.py new file mode 100644 index 0000000..e130660 --- /dev/null +++ b/data_process_tool/get_lose_year.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# @Time : 2024/3/19 15:18 +# @Author : zhaoxiangpeng +# @File : get_lose_year.py + +import os +import re + +from loguru import logger + +ROOT_PATH = "Y:\wos-metadata\issn-data" + +collection = dict() +has_year_collection = set() + +# 获取主路径下的所有ISSN文件夹 +master_dirs = os.listdir(ROOT_PATH) +for master_dir in master_dirs: + logger.debug('检测路径: %s' % master_dir) + # 一个ISSN的主文件下理应是以数字命名的文件或以IS=(xx)命名的文件夹 + issn_file_path = os.path.join(ROOT_PATH, master_dir) + child_dirs = os.listdir(issn_file_path) + single_year_list = [] + for child_dir_name in child_dirs: + # 判断是否是文件夹 + child_dir = os.path.join(issn_file_path, child_dir_name) + if not os.path.isdir(child_dir): + # logger.debug('检测 %s 不是一个文件夹, 跳过' % child_dir_name) + continue + else: + # 如果是文件夹,获取文件夹中的年份信息 + results = re.findall(r'={1,2}\((\d{4})\)', child_dir_name) + if results: + single_year_list.append(int(results[0])) + has_year_collection.add(master_dir) + # 把单个年份的列表排序取最大最小值,计算应该有的年份数量,进行对比 + if not single_year_list: + continue + single_year_list = sorted(single_year_list) + must_year_list = list(range(single_year_list[0], (single_year_list[-1]+1))) + if len(single_year_list) < len(must_year_list): + lose_year_list = set(must_year_list) - set(single_year_list) + lose_year_list = list(lose_year_list) + logger.warning("%s 有年份缺失, 缺失的年份有: %s" % (master_dir, list(lose_year_list))) + collection[master_dir] = lose_year_list + +logger.warning('有年份的文件夹: %s' % has_year_collection) +logger.warning('缺失年份的文件夹: %s' % collection) diff --git a/data_process_tool/merge_table.py b/data_process_tool/merge_table.py new file mode 100644 index 0000000..7005029 --- /dev/null +++ b/data_process_tool/merge_table.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/8/22 14:35 +# @Author : ZAOXG +# @File : merge_table.py + +import re +import os +import pandas as pd +import traceback +from pprint import pprint + +import csv + +from data_process_tool import read_data +from data_process_tool import write_data + + +def merge_table(path, file_list: list = None, to_type: str = 
'xlsx', duplicates: list = None, on_columns: list = None, **kwargs): + """ + 合并指定目录下的所有文件 + """ + write_kwargs = {} + if kwargs.get('keep'): + write_kwargs['keep'] = kwargs.pop('keep', None) + if not file_list: + file_list = get_dirs_file(path, kwargs.pop('file_sort', False)) + r: pd.DataFrame = pd.DataFrame() + for file in file_list: + if file.endswith(('.zip', '.7z')): + continue + try: + temp_t = read_data(os.path.join(path, file), **kwargs) + print('%s -> %s' % (file, temp_t.shape)) + # temp_t['source'] = source + if on_columns and len(on_columns): + temp_t = temp_t[on_columns] + r = pd.concat([r, temp_t]) + except Exception as e: + traceback.print_exc() + print(e) + print(file) + if duplicates: + r.drop_duplicates(subset=duplicates, inplace=True, **write_kwargs) + file_name = re.split(r'\\|/', path)[-1] + full_name = os.path.join(path, file_name) + write_data(r, full_name+f'.{to_type}', index=False) + return path, r.shape[0] + + +def get_dirs_file(file_dir, file_sort: bool = False): + all_files = [] + for root, dirs, files in os.walk(file_dir): + if file_sort: + files.sort(key=lambda x: int(x.split('.')[0])) + for file in files: + all_files.append(os.path.join(root, file)) + for dir_ in dirs: + full_dir = os.path.join(root, dir_) + file_list = os.listdir(full_dir) + if file_sort: + file_list.sort(key=lambda x: float(x.rsplit('.', maxsplit=2)[0])) + for file in file_list: + all_files.append(os.path.join(full_dir, file)) + return all_files + + +def split_filename(path): + fileHead = {} + files = os.listdir(path) + for file in files: + filename, filetype = os.path.splitext(file) + head = re.split(r'\d', filename)[0] + fileHead.setdefault(head, []).append( + os.path.join(path, file) + ) + return fileHead + + +def merge_file2(file_list: list, filename: str = None, to_type: str = 'xlsx'): + r: pd.DataFrame = pd.DataFrame() + for file in file_list: + if file.endswith(('.zip', '.7z')): + continue + try: + temp_t = read_data(file) + # temp_t['source'] = file + r = pd.concat([r, temp_t]) + except Exception as e: + traceback.print_exc() + print(file) + write_data(r, filename + f'.{to_type}', index=False) + + +def merge_by_file_head(): + OUT_PATH = 'Z:/temp-data/集成电路4.11-2/作者合并' + mm = split_filename('Z:/temp-data/集成电路4.11-2/重新下载文件') + for m1, m2 in mm.items(): + merge_file2(m2, os.path.join(OUT_PATH, m1), to_type='csv') + + +def merge_csv_files(path): + file_name = re.split(r'\\|/', path)[-1] + full_name = os.path.join(path, file_name) + with open(full_name+'.csv', 'a', encoding='utf-8', newline='') as new_merge_file: + + files = os.listdir(path) + for file in files: + print(file) + if file.endswith('.csv'): + file_path = os.path.join(path, file) + try: + with open(file_path, 'r', encoding='utf-8') as f: + + new_merge_file.write(f.read()) + except Exception as exception: + print(exception) + print('合并完成') + + +def merge_txt_files(path): + file_name = re.split(r'\\|/', path)[-1] + full_name = os.path.join(path, file_name) + has_header = False + with open(full_name+'.csv', 'ab') as new_merge_file: + + files = os.listdir(path) + for file in files: + print(file) + if file.endswith('.txt'): + file_path = os.path.join(path, file) + try: + with open(file_path, 'rb') as f: + if not has_header: + head_line = f.readline() + new_merge_file.write(head_line) + + while line_data := f.readline(): + new_merge_file.write(line_data) + except Exception as exception: + print(exception) + print('合并完成') + + +def count_file_rows(path: str = None): + rows_count = 0 + files_count = 0 + files_info = {} + files = 
os.listdir(path) + for file in files: + file_path = os.path.join(path, file) + try: + with open(file_path, 'r', encoding='utf-8') as f: + lines = f.readlines() + rows_count += len(lines) + files_info[file] = len(lines) + except Exception as e: + files_info[file] = str(e) + files_count += 1 + return { + 'files_path': path, + 'rows_count': rows_count, + 'files_count': files_count, + 'files_info': files_info, + 'exclude_head': rows_count - files_count + } + + +if __name__ == '__main__': + # merge_table(fr'Y:\zhaoxiangpeng\BCR\2025BCR\API采集', encoding_errors='ignore', on_bad_lines='skip') + # merge_table(fr'Z:\文章摘要推荐池\toppaper核心数据\初始文章数据\2023-3月\Highly Cited Papers', on_columns=['EID'], encoding_errors='ignore', on_bad_lines='skip') + # merge_table(fr'Z:\文章摘要推荐池\toppaper核心数据\初始文章数据\2023-3月\Highly Cited Papers', encoding_errors='ignore', on_bad_lines='skip', engine='python', skiprows=1, skipfooter=2) + # merge_table(fr'Y:\wos-metadata\wos increment-202403\03', encoding_errors='ignore', on_bad_lines='skip', duplicates=['UT'], keep='last', file_sort=False) + # merge_table(fr'Y:\zhaoxiangpeng\2024BCR\API采集数据', encoding_errors='ignore', on_bad_lines='skip', to_type="txt") + merge_table(fr"Z:\客户数据存储\WOS\清华大学\OG=(Tsinghua University) AND (PY==(2024))", duplicates=['UT'], encoding_errors='ignore', on_bad_lines='skip', keep='last', file_sort=True) + # merge_table(fr"Y:\数据采集需求\wosid下载\incites教育部学科数据", duplicates=['入藏号'], keep='last', on_columns=['入藏号', 'DOI'], on_bad_lines='skip') + # merge_table(fr"F:\工作数据存储2025\20250101_中文发文引文更新\2024", duplicates=['网址'], keep='last', to_type='csv') + # merge_table(fr'Z:\客户数据存储\专利\中南大学\23~', to_type='csv') + # merge_table(fr'Y:\wos-metadata\SCHOOL\bj\待下载DOI', to_type='csv', duplicates=['DO']) + # merge_table(fr'F:\工作数据存储2023\20231221_北京师范大学\补充数据\EI', on_bad_lines='skip', on_columns=['Accession number'], duplicates=['Accession number'], to_type='xlsx') # duplicates=['Title', 'Accession number'] + # merge_table(fr'Z:\文章摘要推荐池\toppaper核心数据\初始文章数据\2023-3月\Highly Cited Papers', encoding="ISO-8859-1", on_bad_lines='skip', engine='python', skiprows=1, skipfooter=2, duplicates=['Accession Number'], to_type='xlsx') + # merge_by_file_head() + # file_s = ['Z:/客户数据存储/TEMP\\电子科技大学\\电子科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西安建筑科技大学\\西安建筑科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\四川大学\\四川大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中山大学\\中山大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中国农业大学\\中国农业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\江苏师范大学\\江苏师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中国人民大学\\中国人民大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\上海工程技术大学\\上海工程技术大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西安交通大学\\西安交通大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\首都师范大学\\首都师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西北工业大学\\西北工业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\重庆交通大学\\重庆交通大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\清华大学\\清华大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\长安大学\\长安大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\三江学院\\三江学院2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\华东师范大学\\华东师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\福州大学\\福州大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\安徽医科大学\\安徽医科大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\扬州大学\\扬州大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\北京理工大学\\北京理工大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\兰州大学\\兰州大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\江汉大学\\江汉大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\上海电力大学\\上海电力大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\华东理工大学\\华东理工大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西安电子科技大学\\西安电子科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西北师范大学\\西北师范大学2023-2023中文发文.csv', 
'Z:/客户数据存储/TEMP\\上海财经大学\\上海财经大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\三峡大学\\三峡大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京中医药大学\\南京中医药大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\浙江大学\\浙江大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中南大学\\中南大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\同济大学\\同济大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中国科学技术大学\\中国科学技术大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中欧国际工商学院\\中欧国际工商学院2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\合肥工业大学\\合肥工业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京航空航天大学\\南京航空航天大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京工程学院\\南京工程学院2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\江南大学\\江南大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\安徽建筑大学\\安徽建筑大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中南财经政法大学\\中南财经政法大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\河南师范大学\\河南师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\曲阜师范大学\\曲阜师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京农业大学\\南京农业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京林业大学\\南京林业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西南财经大学\\西南财经大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京医科大学\\南京医科大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\国防科技大学\\国防科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京大学\\南京大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京师范大学\\南京师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\深圳大学\\深圳大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\华中科技大学\\华中科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\成都理工大学\\成都理工大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西南科技大学\\西南科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\江苏大学\\江苏大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中国矿业大学\\中国矿业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\闽江学院\\闽江学院2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\吉林大学\\吉林大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\中国地质大学(武汉)\\中国地质大学(武汉)2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\陕西师范大学\\陕西师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\常州大学\\常州大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西南交通大学\\西南交通大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\苏州大学\\苏州大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\东南大学\\东南大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\首都医科大学\\首都医科大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\湖南师范大学\\湖南师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\北京科技大学\\北京科技大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京审计大学\\南京审计大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\郑州大学\\郑州大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京工业大学\\南京工业大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\湖南师范\\湖南师范2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\北京师范大学\\北京师范大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\西南民族大学\\西南民族大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\深圳技术大学\\深圳技术大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\南京财经大学\\南京财经大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\天津大学\\天津大学2023-2023中文发文.csv', 'Z:/客户数据存储/TEMP\\河海大学\\河海大学2023-2023中文发文.csv'] + # + # merge_file2(file_list=file_s, filename='F:/工作数据存储2023/20230615_中文发文引文更新/2023', to_type='csv') + # d = read_data(r'Z:\客户数据存储\专利\南京大学\南京大学-专利网.xlsx') + # d.to_csv(r'Z:\客户数据存储\专利\南京大学\南京大学-专利网.csv', index=False) + # merge_csv_files(r'Z:\客户数据存储\WOS\南京工业大学\OG=(Nanjing Tech University) AND (PY==(2022))') + # pprint(count_file_rows(r'Z:\客户数据存储\EI\东南大学\2023')) + """ + path_root = 'Z:\客户数据存储\WOS\成都理工大学' + for x in os.listdir(path_root): + # if x != '2002': + # continue + # Accession number + # merge_table(os.path.join(path_root, x), on_columns=['Accession number'], duplicates=['Accession number'], to_type='csv') + merge_table(os.path.join(path_root, x), duplicates=['UT'], keep='last', file_sort=True) + """ diff --git a/doi_parse/__init__.py b/doi_parse/__init__.py new file mode 100644 index 0000000..0a2da39 --- /dev/null +++ b/doi_parse/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/5/31 14:38 +# @Author : ZhaoXiangPeng +# @File : __init__.py diff --git a/doi_parse/getkeys.py b/doi_parse/getkeys.py new file 
mode 100644 index 0000000..129f125 --- /dev/null +++ b/doi_parse/getkeys.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/6/2 14:02 +# @Author : ZhaoXiangPeng +# @File : getkeys.py + +from ReSpider.db.redisdb import RedisDB +import pandas as pd +import json +from typing import List, Dict + + +class GetCount: + def __init__(self, db: int = 1, write_path: str = None): + self.client = RedisDB(db=db) + self.write_path = write_path or 'E:/inspec/' + + def get_keys(self) -> List[bytes]: + return self.client.keys(pattern='*-*') + + def get_kv(self, key) -> dict: + return self.client.hgetall(key) + + @staticmethod + def format(records: dict, key: str) -> list: + temp = [] + for k, v in records.items(): + record: dict = {'doi': k.decode('utf-8'), 'count': int(v.decode('utf-8')), 'issn': key} + temp.append(record) + return temp + + def to_csv(self, data: list = None, file_name: str = None): + df = pd.DataFrame(data) + df.to_csv(self.write_path+file_name+'.csv', index=False) + + def aa(self): + redis_keys: List[bytes] = self.get_keys() + for redis_key in redis_keys: + key_string: str = redis_key.decode('utf-8') + kvs: dict = self.get_kv(key_string) + print('*'*5 + key_string + '*'*5) + key_records = self.format(kvs, key_string) + self.to_csv(data=key_records, file_name=key_string) + + +if __name__ == '__main__': + gc = GetCount(db=2) + # gc.get_keys() + gc.aa() diff --git a/doi_parse/gz.py b/doi_parse/gz.py new file mode 100644 index 0000000..008e967 --- /dev/null +++ b/doi_parse/gz.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/5/31 8:52 +# @Author : ZhaoXiangPeng +# @File : gz.py + +import gzip +import json +import ujson + + +def data_parse(io): + data = json.load(io) + items = data['items'] + for item in items: + doi = item.get('DOI') + dtype = item.get('type') + issn = item.get('ISSN') + title = item.get('title') and item.get('title')[0] + source = item.get('container-title') and item.get('container-title')[0] + print('****************************************************\n' + 'TITLE: %s\n' + 'DOI: %s\n' + 'TYPE: %s\n' + 'ISSN: %s\n' + 'SOURCE: %s\n' % (title, doi, dtype, issn, source)) + if not item.get('reference-count', 0): + continue + try: + reference_list = [] + for reference in item.get('reference', []): + ref_doi = reference.get('DOI') + if ref_doi: + # do something + reference_list.append(ref_doi) + continue + ref_at = reference.get('article-title') + if ref_at: + print(ref_at) + reference_list.append(ref_at) + continue + ref_jt = reference.get('journal-title') + except KeyError: + print(item.keys()) + + +def un_gz(file_name): + g_file = gzip.GzipFile(file_name) + return g_file + + +if __name__ == '__main__': + un_gz('H:/crossref_public_data_file_2021_01/1.json.gz') diff --git a/doi_parse/inspec2mongo.py b/doi_parse/inspec2mongo.py new file mode 100644 index 0000000..a8e5144 --- /dev/null +++ b/doi_parse/inspec2mongo.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/6/8 16:54 +# @Author : ZhaoXiangPeng +# @File : inspec2mongo.py + +from ReSpider.db.mongodb import AsyncMongoDB +from ReSpider.extend.logger import LogMixin +from doi_parse.inspec2redis import InspecToRedis +import asyncio +import os +import json +import time +import logging + + +DOI_LIST = {'10.19381/j.issn.1001-7585.2020.21.008', '10.14111/j.cnki.zgfx.2019.06.004', '10.13530/j.cnki.jlis.190047'} + + +class AsyncBase(LogMixin): + def __init__(self, task_list: list, limit: int = 6, loop=None): + super().__init__() + self.TASK_LIST: list = task_list + self.loop = loop or 
asyncio.get_event_loop() + self.limit = limit + + def add_callback(self, func): + """ + 添加任务处理方法 + """ + self.call_func = func + + async def next_task(self): + semaphore = asyncio.Semaphore(value=self.limit) + while True: + try: + task = self.TASK_LIST.pop() + await semaphore.acquire() + self.loop.create_task( + self.call_func(task, semaphore)) + except IndexError: + await asyncio.sleep(3) + if len(asyncio.all_tasks(loop=self.loop)) <= 1: + break + + def execute(self): + self.logger.info('TASK INIT SUCCESS.') + try: + self.loop.run_until_complete( + self.next_task() + ) + except Exception as e: + self.logger.error('execute %s' % e, exc_info=True) + finally: + self.loop.run_until_complete( + self.loop.shutdown_asyncgens()) + self.loop.stop() + self.logger.info('THE END.') + + +class InspecToMongo(InspecToRedis): + def __init__(self, db=None, file_list=None, root: str = None): + super().__init__(file_list=file_list) + self.db: AsyncMongoDB = db + self.file_list: list = file_list + print('初始化任务 %s 个' % len(file_list)) + self.root = root or 'H:/crossref_public_data_file_2021_01/' + + async def process(self, file=None, semaphore=None): + if not file and self.file_list.__len__(): + file = self.get_next() + if not file: + return + print('******************** 当前处理文件 %s ********************' % file) + + io = self.compatible(file) + await self.data_parse(io, semaphore) + + async def data_parse(self, io, semaphore=None): + data = json.load(io) + io.close() + items = data['items'] + for item in items: + doi = item.get('DOI') + doi_tag = False + # dtype = item.get('type') + # issn = item.get('ISSN') + # title = item.get('title') and item.get('title')[0] + # source = item.get('container-title') and item.get('container-title')[0] + # publisher = item.get('publisher') + ref_count = item.get('reference-count', 0) + """ + print('****************************************************\n' + 'TITLE: %s\n' + 'DOI: %s\n' + 'TYPE: %s\n' + 'ISSN: %s\n' + 'SOURCE: %s\n' % (title, doi, dtype, issn, source))""" + """ + # mongodb存一份 + ser_item = {'TITLE': title, 'DOI': doi, 'TYPE': dtype, 'PUBLISHER': publisher, + 'REFERENCE-COUNT': ref_count, + 'SOURCE': source, 'ISSN': issn} + """ + if doi in DOI_LIST: + doi_tag = True # 如果doi为所需的doi,则所有参考文献都加入引用表, 且不太可能引用自己 + if not ref_count: + continue + try: + reference_list = [] + for reference in item.get('reference', []): + ref_doi = reference.get('DOI') + if not ref_doi: + # 没有doi直接跳过 + continue + # 有doi的逻辑 + if doi_tag: + doi_in = {'doi': doi, 'ref_doi': ref_doi} + print(doi_in) + reference_list.append(doi_in) + continue + elif ref_doi in DOI_LIST: + ref_doi_in = {'doi': doi, 'ref_doi': ref_doi} + print(ref_doi_in) + reference_list.append(ref_doi_in) + continue + # ref_at = reference.get('article-title') + # if ref_at: + # # print(ref_at) + # # reference_list.append(ref_at) + # continue + # ref_jt = reference.get('journal-title') + except KeyError: + print(item.keys()) + else: + if not reference_list: + continue + # print(reference_list[0]) + await self.db.add_batch('data_crossref_doirelation', reference_list) + semaphore.release() # 释放锁 + + def start(self): + aio_task = AsyncBase(self.file_list, limit=6) + aio_task.add_callback(self.process) + aio_task.execute() + + +if __name__ == '__main__': + mdb = AsyncMongoDB(host='127.0.0.1', port=27017, db='data_crossref') + files = InspecToRedis.load_gz('H:/crossref_public_data_file_2021_01') + files = files[20000:] + # files = ['0.json'] + s = time.time() + i2m = InspecToMongo( + db=mdb, + file_list=files + ) + i2m.start() + # 
i2r.inspec2redis() + print('耗时 %s 秒' % (time.time() - s)) diff --git a/doi_parse/inspec2redis.py b/doi_parse/inspec2redis.py new file mode 100644 index 0000000..0b5b7d2 --- /dev/null +++ b/doi_parse/inspec2redis.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +# @Time : 2022/5/31 10:02 +# @Author : ZhaoXiangPeng +# @File : inspec2redis.py + +from ReSpider.db.redisdb import RedisDB +from doi_parse.gz import un_gz +from queue import Queue +import pandas as pd +import os +import time +import json +from concurrent.futures import ThreadPoolExecutor + + +class Counter: + def __init__(self, clint=None): + self.client: RedisDB = clint + if clint is None: + self.client = RedisDB(db=2) + + def incr(self, key: list, counter: list): + """ + key: 可以作为刊唯一id的 redis key + counter: 可以作为文章唯一id的 doi 或 title + """ + pipe = self.client._redis.pipeline() + pipe.multi() + for k in key: + k = k.upper() + if not self.sismember('inspec:journals', k): + continue + for value in counter: + pipe.hincrby(k, value) + pipe.execute() + + def decr(self, key: str, counter: str, amount=1): + """计数累减""" + return self.client.hincrby(key, counter, amount=-amount) + + def get_cnt(self, key: str, counter: str): + """获取当前计数的值""" + return self.client.hget(key, counter) + + def sismember(self, key, value): + return self.client.sismember(key, value) + + +class InspecToRedis: + """ + 1. 加载文件 + 2. 解压 + 3. 解析 + 4. 入库 + 4.1 ISSN 关联 + 4.2 ISSN1 + doi1 1 + doi2 1 + ISSN2 + doi1 2 + doi2 1 + """ + def __init__(self, counter=None, file_list=None, root: str = None): + self.counter = counter # 初始化计数器 + self.file_list: list = file_list + print('初始化任务 %s 个' % len(file_list)) + # self.to_queue(file_list) + self.root = root or 'H:/crossref_public_data_file_2021_01/' + + @staticmethod + def load_gz(file_path): + return os.listdir(file_path) + + # def to_queue(self, file_list): + # for file in file_list: + # self.file_list.put_nowait(file) + # print('*'*50, '注入队列完成', '*'*50) + + def inspec2redis(self): + df = pd.read_csv('F:/工作数据存储2022/20220526_inspec测试/inspec期刊列表2.csv') + issn_list = df['ISSN'].values.tolist() + return self.counter.client.sadd('inspec:journals', issn_list) + + def get_next(self): + item = self.file_list.pop() + return item + + def compatible(self, file): + if file[-2:] == 'gz': + io = un_gz(self.root+file) + else: + io = open(self.root+file, encoding='utf-8') + return io + + def data_parse(self, io): + data = json.load(io) + items = data['items'] + for item in items: + doi = item.get('DOI') + dtype = item.get('type') + issn = item.get('ISSN') + title = item.get('title') and item.get('title')[0] + source = item.get('container-title') and item.get('container-title')[0] + publisher = item.get('publisher') + ref_count = item.get('reference-count', 0) + """ + print('****************************************************\n' + 'TITLE: %s\n' + 'DOI: %s\n' + 'TYPE: %s\n' + 'ISSN: %s\n' + 'SOURCE: %s\n' % (title, doi, dtype, issn, source))""" + """ + # mongodb存一份 + ser_item = {'TITLE': title, 'DOI': doi, 'TYPE': dtype, 'PUBLISHER': publisher, + 'REFERENCE-COUNT': ref_count, + 'SOURCE': source, 'ISSN': issn} + """ + if not ref_count: + continue + try: + reference_list = [] + for reference in item.get('reference', []): + ref_doi = reference.get('DOI') + if ref_doi: + # do something + # print(ref_doi) + reference_list.append(ref_doi) + continue + ref_at = reference.get('article-title') + if ref_at: + # print(ref_at) + # reference_list.append(ref_at) + continue + ref_jt = reference.get('journal-title') + except KeyError: + print(item.keys()) + else: 
+ self.counter.incr(issn, reference_list) + + def pro(self, file=None): + # print('剩余任务 %s 个' % self.file_list.__len__()) + if not file and self.file_list.__len__(): + file = self.get_next() + if not file: + return + print('******************** 当前处理文件 %s ********************' % file) + + io = self.compatible(file) + self.data_parse(io) + io.close() + + def batch(self): + with ThreadPoolExecutor(max_workers=2) as executor: + executor.map(self.pro, self.file_list) + + def start(self): + index_count = 0 + while len(self.file_list): + index_count += 1 + print('当前处理第 %s 个' % index_count) + self.pro() + + +if __name__ == '__main__': + count = Counter() + # files = InspecToRedis.load_gz('H:/crossref_public_data_file_2021_01') + # files = files[40000:] + files = ['0.json'] + i2r = InspecToRedis(counter=count, file_list=files) + s = time.time() + i2r.batch() + # i2r.inspec2redis() + print('耗时 %s 秒' % (time.time()-s)) diff --git a/doi_parse/inspec数据库施引测试数据.csv b/doi_parse/inspec数据库施引测试数据.csv new file mode 100644 index 0000000..d92dabe --- /dev/null +++ b/doi_parse/inspec数据库施引测试数据.csv @@ -0,0 +1,443 @@ +doi,count,issn +10.1002/cite.201800135,3.0,0009-286X; 2058-9883 +10.1002/cnm.3180,2.0,1617-7959; 2267-1242 +10.1002/eng2.12217,1.0,1474-0346 +10.1002/geot.201800058,2.0,1865-7362 +10.1002/geot.201900076,2.0,0886-7798; 1865-7362 +10.1002/inst.12189,2.0,2156-485X; 0937-7255 +10.1002/inst.12279,1.0,1098-1241 +10.1002/j.2334-5837.2019.00598.x,2.0,2334-5837 +10.1002/lpor.202000254,1.0,1996-1944 +10.1002/sys.21503,1.0,1539-7734 +10.1007/978-3-030-04849-5_31,1.0,2078-2489 +10.1007/978-3-030-23162-0_4,1.0,1742-6596 +10.1007/978-3-030-23703-5_2,1.0,0020-7543 +10.1007/978-3-030-27878-6_5,1.0,2504-4494 +10.1007/978-3-030-29513-4_72,1.0,2076-3417 +10.1007/978-3-030-29852-4_30,1.0,1424-8220 +10.1007/978-3-030-30985-5_4,1.0,0925-9856 +10.1007/978-3-030-34644-7_40,1.0,2220-9964 +10.1007/978-3-030-34986-8_41,2.0,2161-3915 +10.1007/978-3-030-38077-9_75,1.0,0042-3114 +10.1007/978-3-030-42416-9_18,1.0,2071-1050 +10.1007/978-3-319-59153-7_13,1.0,0941-0643 +10.1007/978-3-319-65298-6_1,3.0,0020-7543; 1064-1246; 1068-798X +10.1007/978-981-15-0802-8_199,1.0,2076-3417 +10.1007/s00170-011-3370-y,4.0,2234-7593; 0268-3768; 0043-1648 +10.1007/s00170-013-5155-y,10.0,2076-3417; 2071-1050; 0377-2217; 0925-5273; 0951-192X; 0018-7208; 1387-3954; 0268-3768; 1996-1073; 1751-7575 +10.1007/s00170-014-6091-1,3.0,0268-3768; 0263-2241; 1424-8220 +10.1007/s00170-018-1617-6,43.0,0888-3270; 2041-1723; 1868-5137; 2288-6206; 0268-3768; 2075-1702; 2076-3417; 0141-9331; 0890-0604; 2071-1050; 2227-9717; 0360-3199; 1050-0472; 1022-0038; 0020-7543; 0013-7944; 1530-9827; 1742-6596; 1729-8814; 0951-192X; 0929-6212; 1747-7778; 1617-9846; 1751-7575 +10.1007/s00170-018-2001-2,11.0,2076-3417; 2504-4494; 2227-9717; 1530-8669; 0951-192X; 2288-6206; 0268-3768; 1068-798X +10.1007/s00170-018-2748-5,12.0,0360-5442; 0161-0457; 2076-3417; 2504-4494; 0925-5273; 0951-192X; 1022-0038; 0950-7051; 0268-3768 +10.1007/s00170-019-03794-z,4.0,0951-192X; 0268-3768; 0020-7543; 2234-7593 +10.1007/s00170-019-04653-7,8.0,0161-0457; 2504-4494; 2288-6206; 0268-3768; 0020-7543 +10.1007/s00170-019-04706-x,3.0,1050-0472; 2076-3417; 0268-3768 +10.1007/s00170-020-05056-9,4.0,2071-1050; 1996-1073; 0268-3768 +10.1007/s00170-020-05387-7,1.0,0268-3768 +10.1007/s00170-020-05567-5,1.0,1059-9495 +10.1007/s00170-020-05977-5,3.0,0268-3768; 2076-3417 +10.1007/s00231-016-1961-8,9.0,0145-8892; 1556-3758; 0177-0667; 0947-7411; 0145-8876; 0921-8831 +10.1007/s10257-018-0376-0,1.0,2504-4494 
+10.1007/s10586-016-0618-1,2.0,1386-7857 +10.1007/s10586-018-2041-2,2.0,1367-5567; 0020-7543 +10.1007/s10845-017-1350-2,28.0,0360-5442; 2076-3417; 1862-4472; 1568-4946; 2071-1050; 1092-7026; 2472-5854; 0268-3768; 0951-192X; 0944-6524; 1024-123X; 1063-293X; 1687-8140; 1741-038X; 0925-2312; 2571-5577; 0020-7543; 1064-1246 +10.1007/s10845-019-01500-0,1.0,2504-4494 +10.1007/s10845-019-01512-w,4.0,2071-1050; 1076-2787; 2076-3417; 1996-1073 +10.1007/s10845-019-01516-6,2.0,2076-3417; 1996-1073 +10.1007/s11831-018-9301-4,15.0,1134-3060; 2504-4494; 2213-7467; 1996-1944; 2523-3963; 0029-5981; 1960-6206; 1438-7492; 0268-3768; 1996-1073; 2311-5521 +10.1007/s11837-017-2709-8,7.0,2214-8604; 1047-4838; 1424-8220; 0964-1726 +10.1007/s11837-020-04028-4,3.0,2214-8604; 1047-4838; 0264-1275 +10.1007/s12008-012-0165-9,13.0,0965-9978; 1530-9827; 0954-4062; 1955-2513; 2213-8463; 0268-3768; 0020-7543 +10.1007/s12008-013-0201-4,4.0,2261-236X; 0723-2632; 1996-1944; 1955-2513 +10.1007/s12008-016-0319-2,11.0,2076-3417; 1742-6596; 1955-2513; 0951-192X; 1868-5137; 1741-038X; 0954-4054; 1751-7575; 0020-7543 +10.1007/s12008-019-00578-3,1.0,1955-2513 +10.1007/s12008-019-00621-3,1.0,2227-9717 +10.1007/s12008-020-00694-5,1.0,1350-4533 +10.1007/s12161-013-9634-4,8.0,0149-6085; 0022-5142; 1541-4337; 0145-8876; 1936-9751 +10.1007/s12517-019-4574-y,1.0,1070-9622 +10.1007/s12599-019-00624-0,2.0,2076-3417; 1350-1917 +10.1007/s12652-018-0881-5,31.0,0888-3270; 1077-5463; 1424-8220; 1868-5137; 0268-3768; 0926-5805; 0096-3003; 1562-2479; 2071-1050; 1367-5788; 2227-9717; 2095-8099; 0950-7051; 1687-8140; 0020-7543; 2504-4494; 0013-7944; 1742-6596; 1729-8814; 0951-192X; 2213-8463; 0929-6212 +10.1007/s12652-018-0911-3,13.0,2076-3417; 2071-1050; 1742-6596; 0263-2241; 0951-192X; 2095-8099; 1350-6307; 0926-5805; 0268-3768; 0954-4054 +10.1007/s12652-018-0944-7,3.0,1678-5878; 1742-6596; 0954-4054 +10.1007/s12652-018-0946-5,15.0,2076-3417; 0954-4054; 2504-4494; 1729-8814; 0951-192X; 2073-8994; 1868-5137; 1350-6307; 0268-3768; 1678-5878; 0020-7543 +10.1007/s12652-018-0953-6,9.0,2504-4494; 0263-2241; 1729-8814; 0951-192X; 1868-5137; 1741-0398 +10.1007/s12652-018-1125-4,7.0,2504-4494; 0268-3768; 1868-5137; 1847-9790 +10.1007/s13349-020-00403-6,2.0,2190-5452; 1424-8220 +10.1007/s40430-020-02461-9,1.0,0268-3768 +10.1007/s40684-020-00196-5,2.0,2234-7593; 2076-3417 +10.1007/s40684-020-00227-1,3.0,2234-7593; 0737-8831; 0020-7543 +10.1007/s40860-018-0069-y,2.0,0929-6212; 1022-0038 +10.1016/j.apm.2019.09.036,4.0,0263-2241; 2470-0045; 0888-3270; 0307-904X +10.1016/j.arcontrol.2019.01.001,2.0,2213-8463; 0020-7543 +10.1016/j.autcon.2019.102837,11.0,1134-3060; 2071-1050; 1093-9687; 1475-9217; 0926-5805; 1687-8086; 1545-2255 +10.1016/j.autcon.2019.102915,4.0,0926-5805; 2076-3417; 1474-0346; 2631-4428 +10.1016/j.autcon.2019.102930,9.0,2076-3417; 0969-9988; 0268-4012; 2071-1050; 1471-4175; 1350-6307; 0926-5805; 0020-7543 +10.1016/j.autcon.2020.103179,12.0,2076-3417; 1618-954X; 2071-1050; 1424-8220; 1474-0346; 0926-5805; 1687-8086 +10.1016/j.autcon.2020.103183,2.0,2071-1050; 1687-8086 +10.1016/j.autcon.2020.103277,1.0,2076-3417 +10.1016/j.cad.2011.07.007,6.0,0965-9978; 0951-192X; 0944-6524; 1745-2759; 0094-2405 +10.1016/j.cherd.2012.08.004,28.0,0959-3330; 1932-2135; 0306-2619; 0145-8892; 1618-954X; 1385-8947; 1568-4946; 1556-7036; 1876-1070; 2077-0375; 0376-7388; 0360-3199; 0926-860X; 0167-7322; 0930-7516; 0022-1481; 0920-4105 +10.1016/j.cirp.2017.04.038,53.0,0954-4062; 0944-6524; 1868-5137; 0268-3768; 2075-1702; 2076-3417; 1478-0771; 2332-9017; 1474-0346; 
0965-9978; 0161-0457; 0890-0604; 2071-1050; 1367-5788; 1087-1357; 1050-0472; 0935-1175; 2095-8099; 0020-7543; 0013-7944; 1530-9827; 0263-2241; 2267-1242; 0951-192X; 1751-7575 +10.1016/j.cirp.2017.04.040,80.0,1098-1241; 0378-7788; 1996-1944; 1424-8220; 1955-2513; 0029-5981; 0944-6524; 1868-5137; 2288-6206; 0268-3768; 2075-1702; 2076-3417; 1999-5903; 2332-9017; 2053-4701; 1474-0346; 2516-8398; 0926-5805; 2227-7080; 0735-3766; 1064-1246; 0268-4012; 0094-114X; 0890-0604; 2071-1050; 1367-5788; 1087-1357; 2227-9717; 1050-0472; 2095-8099; 1741-038X; 0010-4485; 0954-4054; 0020-7543; 1468-8115; 1530-9827; 1742-6596; 2523-3963; 0951-192X; 2234-7593; 0895-6308; 0040-5175; 2261-236X; 1751-7575 +10.1016/j.cirp.2018.04.039,3.0,0951-192X; 1474-0346; 1087-1357 +10.1016/j.cirp.2018.04.055,34.0,1098-1241; 1532-0626; 1868-5137; 2288-6206; 1863-8880; 0268-3768; 2076-3417; 0742-4795; 2332-9017; 1474-0346; 2071-1050; 1367-5788; 1087-1357; 0360-3199; 1050-0472; 2095-8099; 2391-5439; 0954-4054; 0020-7543; 2073-8994; 2504-4494; 0013-7944; 1742-6596; 1729-8814; 0951-192X +10.1016/j.cirp.2018.04.118,2.0,2076-3417; 1063-293X +10.1016/j.cirp.2019.04.011,6.0,0951-192X; 0161-0457; 2076-3417; 0954-4054 +10.1016/j.cirp.2019.04.024,5.0,2076-3417; 0954-4054; 2227-9717; 0020-7543 +10.1016/j.cirp.2019.04.041,1.0,1350-6307 +10.1016/j.cirp.2019.05.010,1.0,2071-1050 +10.1016/j.cirpj.2019.04.007,2.0,0360-5442; 1747-7778 +10.1016/j.cma.2020.112907,2.0,0010-4485; 0178-7675 +10.1016/j.cmpb.2019.01.003,2.0,1617-7959; 1025-5842 +10.1016/j.cobme.2018.04.001,5.0,2296-4185; 0935-9648; 1742-7061; 2054-5703 +10.1016/j.compag.2016.03.005,14.0,0950-5423; 0145-8892; 1866-7910; 2071-1050; 0038-7010; 0022-1155; 2072-4292; 0022-5142; 1537-5110; 2314-4920; 1936-9751 +10.1016/j.compchemeng.2018.09.022,3.0,1735-1472; 1064-1246; 1996-1073 +10.1016/j.compchemeng.2019.106577,1.0,0009-286X +10.1016/j.compind.2019.103130,11.0,2076-3417; 2071-1050; 0263-2241; 1424-8220; 0268-3768; 0020-7543 +10.1016/j.compstruc.2020.106282,3.0,1424-8220 +10.1016/j.ejor.2018.04.032,6.0,2071-1050; 0377-2217; 1868-5137; 1747-7778; 0160-5682; 0920-8542 +10.1016/j.enbuild.2019.07.015,1.0,1940-1493 +10.1016/j.eng.2019.01.014,26.0,1134-3060; 2076-3417; 0009-286X; 1866-7910; 2504-4494; 1999-5903; 2071-1050; 2640-4567; 1424-8220; 2214-7853; 2227-9717; 2523-3963; 2095-8099; 1474-0346; 1868-5137; 2516-8398; 0737-3937; 0954-4054 +10.1016/j.engfracmech.2019.106673,2.0,2214-8604; 2193-9764 +10.1016/j.engfracmech.2019.106674,3.0,1996-1944; 2267-1242; 0307-904X +10.1016/j.engfracmech.2019.106766,3.0,1099-4300; 1996-1944; 2571-631X +10.1016/j.engfracmech.2020.107075,2.0,2071-1050; 0013-7944 +10.1016/j.ergon.2019.02.001,13.0,2076-3417; 1022-1360; 2071-1050; 0263-2241; 1424-8220; 2523-3963; 1955-2513; 1080-3548; 2169-3277; 0268-3768 +10.1016/j.fusengdes.2017.10.012,6.0,2332-9017; 0029-5515; 0268-3768; 0020-7543; 2571-631X +10.1016/j.future.2019.12.020,6.0,2076-3417; 1424-8220; 0929-6212; 1741-0398; 2073-8994 +10.1016/j.ifacol.2015.06.141,77.0,1434-5021; 2578-0727; 1424-8220; 0944-6524; 1532-0626; 1868-5137; 1350-6307; 1076-2787; 0268-3768; 2076-3417; 1092-7026; 2332-9017; 1474-0346; 2227-7080; 1064-1246; 0306-2619; 0268-4012; 0890-0604; 2071-1050; 2156-485X; 1367-5788; 1087-1357; 2227-9717; 2296-4185; 2095-8099; 1099-4300; 1246-0125; 2524-8510; 0954-4054; 0020-7543; 2073-8994; 0177-0667; 0952-1976; 2220-9964; 2267-1242; 1729-8814; 2214-7853; 0951-192X; 2169-3277; 2213-8463; 1617-9846 +10.1016/j.ifacol.2016.11.115,30.0,1996-1944; 2363-7005; 1868-5137; 2288-6206; 0268-3768; 2332-9017; 
1474-0346; 2071-1050; 2095-8099; 0010-485X; 1099-4300; 0954-4054; 0020-7543; 2073-8994; 0360-5442; 1742-6596; 2214-7853; 0951-192X; 1751-7575 +10.1016/j.ifacol.2017.08.2360,2.0,0268-3768 +10.1016/j.ifacol.2017.08.902,4.0,0951-192X; 2075-1702; 1042-9247; 0360-5442 +10.1016/j.ifacol.2018.03.104,3.0,2504-4494; 0268-3768; 1678-5878 +10.1016/j.ifacol.2018.06.356,4.0,0920-4105; 1687-8140 +10.1016/j.ifacol.2019.10.024,1.0,0957-0233 +10.1016/j.ifacol.2019.11.383,3.0,0953-7287; 0020-7543; 0268-4012 +10.1016/j.ifacol.2019.11.536,1.0,0953-7287 +10.1016/j.ifacol.2019.11.685,2.0,0178-2312 +10.1016/j.ijinfomgt.2019.05.020,6.0,0926-5805; 2640-4567; 0268-4012; 0360-3199 +10.1016/j.ijmst.2015.09.016,1.0,2071-1050 +10.1016/j.infrared.2015.06.002,5.0,2076-3417; 2071-1050; 0263-2241; 0268-3768; 0948-7921 +10.1016/j.infrared.2019.04.007,4.0,0703-8992; 2076-3417; 0145-8892; 1936-9751 +10.1016/j.isatra.2019.05.011,3.0,0268-3768; 0263-2241; 2227-9717 +10.1016/j.jclepro.2019.04.156,10.0,2412-3811; 2071-1050; 2076-3417; 1424-8220 +10.1016/j.jclepro.2019.119299,2.0,0360-5442; 0890-0604 +10.1016/j.jclepro.2019.119423,4.0,0951-192X; 2071-1050; 1741-0398; 0360-5442 +10.1016/j.jmapro.2013.10.004,10.0,1042-6914; 0043-1648; 1091-0344; 2214-7853; 1876-990X; 0268-3768; 0890-6955; 1678-5878 +10.1016/j.jmatprotec.2013.03.013,57.0,0884-2914; 1438-1656; 1996-1944; 2288-6206; 1350-6307; 0268-3768; 2159-6867; 0264-1275; 1464-4207; 2073-4360; 0925-5273; 2352-4928; 2227-7080; 0928-4931; 2214-8604; 1355-2546; 1359-6454; 1087-1357; 0167-6636; 1073-5623; 1073-5615; 0263-2241; 0734-743X; 0020-7403; 0921-5093; 2075-4701 +10.1016/j.jmsy.2018.02.002,21.0,2076-3417; 2224-2708; 2071-1050; 0263-2241; 1729-8814; 0885-8624; 0951-192X; 2213-8463; 2516-8398; 1741-038X; 0268-3768; 0954-4054; 1064-1246; 0307-904X; 0020-7543 +10.1016/j.jmsy.2018.05.003,25.0,2076-3417; 0890-0604; 1530-9827; 1424-8220; 0268-3768; 1386-7857; 0885-8624; 2234-7593; 2190-7188; 1868-5137; 1463-5771; 1741-038X; 1463-7154; 0020-7543 +10.1016/j.jmsy.2020.04.005,1.0,1996-1073 +10.1016/j.knosys.2014.03.010,4.0,0952-1976; 0950-7051 +10.1016/j.measurement.2011.09.018,2.0,0957-0233; 0263-2241 +10.1016/j.medengphy.2019.08.007,5.0,1350-4533; 2311-5521; 1364-503X; 2040-7939 +10.1016/j.mfglet.2018.02.006,28.0,0306-2619; 2050-7038; 2075-1702; 2079-9292; 0890-0604; 1093-9687; 1742-6596; 1424-8220; 2227-9717; 0951-192X; 0178-2312; 2213-8463; 1350-6307; 0926-5805; 0268-3768; 0007-6813; 0020-7543; 0307-904X +10.1016/j.mfglet.2020.04.004,4.0,2214-8604; 1474-0346; 1996-1073 +10.1016/j.molliq.2016.12.028,11.0,1350-4177; 0041-624X; 0956-053X; 1872-5805; 1615-9306; 0167-7322; 2297-8747; 2398-4902 +10.1016/j.net.2020.03.028,1.0,1076-2787 +10.1016/j.optlastec.2010.07.010,5.0,0732-8818; 1568-4946; 0263-2241; 2523-3963; 2196-7229 +10.1016/j.procir.2016.11.152,68.0,1866-7910; 1574-017X; 1424-8220; 2363-7005; 0944-6524; 1868-5137; 0268-3768; 2075-1702; 2076-3417; 2332-9017; 0141-9331; 1474-0346; 2227-7080; 1996-1073; 0268-4012; 2071-1050; 1367-5788; 2288-5048; 2095-8099; 1687-8140; 1741-038X; 2095-0233; 2391-5439; 0954-4054; 0020-7543; 2224-2708; 0952-1976; 0013-7944; 1742-6596; 2504-4494; 0263-2241; 2214-7853; 0951-192X; 2169-3277; 2572-6668; 1747-7778; 1757-8981; 1687-8086; 2311-5521; 2424-8622 +10.1016/j.procir.2017.02.035,2.0,0951-192X; 0268-3768 +10.1016/j.procir.2017.12.168,9.0,2076-3417; 1742-6596; 0253-3839; 0178-2312; 0954-4054; 1996-1073 +10.1016/j.procir.2018.02.010,5.0,0951-192X; 2071-1050; 2076-3417; 1424-8220 +10.1016/j.procir.2018.03.103,19.0,0161-0457; 1098-1241; 0890-0604; 
1367-5788; 1087-1357; 1729-8814; 2267-1242; 2095-8099; 1474-0346; 1532-0626; 0268-3768; 2227-7080; 0005-1055; 1331-677X +10.1016/j.procir.2018.03.139,6.0,1742-6596; 0951-192X; 0268-3768; 2075-1702; 2391-5439 +10.1016/j.procir.2018.03.166,8.0,0178-2312; 1742-6596; 0268-3768; 0890-0604 +10.1016/j.procir.2018.03.178,8.0,0890-0604; 2504-4494; 1087-1357; 1532-0626; 1474-0346; 1350-6307; 0268-3768; 0020-7543 +10.1016/j.procir.2018.03.192,10.0,2076-3417; 0890-0604; 2504-4494; 2071-1050; 1424-8220; 1474-0346; 1747-7778; 0268-3768; 0020-7543 +10.1016/j.procir.2018.04.076,2.0,2071-1050; 2076-3417 +10.1016/j.procir.2018.04.078,6.0,2076-3417; 0094-114X; 1532-0626; 2075-5309; 0268-3768 +10.1016/j.procir.2019.02.087,3.0,0268-3768; 1742-6596; 1350-6307 +10.1016/j.procir.2019.02.104,3.0,2075-1702; 2095-8293; 2267-1242 +10.1016/j.procir.2019.02.131,2.0,2076-3417; 1751-7575 +10.1016/j.procir.2019.03.072,7.0,2076-3417; 2071-1050; 0951-192X; 2516-8398; 2571-631X +10.1016/j.procir.2019.03.141,4.0,0951-192X; 2227-9717; 0020-7543; 1087-1357 +10.1016/j.procir.2019.03.182,1.0,1474-0346 +10.1016/j.procir.2019.03.212,1.0,0268-3768 +10.1016/j.procir.2019.03.223,3.0,2071-1050; 2076-3417; 0268-3768 +10.1016/j.procir.2019.04.040,6.0,1087-1357; 0268-3768; 0171-8096; 0020-7543; 1064-1246 +10.1016/j.procir.2019.04.049,2.0,2079-3197; 0020-7543 +10.1016/j.procir.2019.04.084,6.0,2504-4494; 2076-3417; 1424-8220; 0950-7051 +10.1016/j.procir.2019.04.103,3.0,1866-7511; 2076-3417; 1955-2513 +10.1016/j.procir.2019.04.176,2.0,0926-5805; 0020-7543 +10.1016/j.procir.2019.04.219,1.0,2071-1050 +10.1016/j.procir.2019.04.330,3.0,2071-1050; 2076-3417; 2288-6206 +10.1016/j.procir.2020.01.043,1.0,2267-1242 +10.1016/j.procir.2020.01.049,2.0,2076-3417 +10.1016/j.procir.2020.05.020,1.0,1999-5903 +10.1016/j.procs.2017.09.003,11.0,1098-1241; 2156-485X; 2213-7467; 1530-9827; 1087-1357; 2214-7853; 2523-3963; 2213-8463; 0937-7255; 1687-8086; 0041-624X +10.1016/j.procs.2019.09.032,3.0,2314-4904; 1742-6596; 2424-8622 +10.1016/j.proeng.2014.12.394,6.0,2072-666X; 0264-1275; 1757-8981; 1024-123X; 0143-991X; 0010-4485 +10.1016/j.proeng.2014.12.395,1.0,0268-3768 +10.1016/j.proeng.2015.07.314,10.0,1742-6596; 1757-8981; 1024-123X; 0268-3768; 1755-1307 +10.1016/j.promfg.2017.04.039,8.0,0360-5442; 1742-6596; 2332-9017; 0954-4062; 0951-192X; 0268-3768; 0020-7543 +10.1016/j.promfg.2017.04.043,33.0,0886-7798; 1424-8220; 1868-5137; 0268-3768; 2076-3417; 2332-9017; 2516-8398; 1463-5771; 1996-1073; 2071-1050; 1087-1357; 2227-9717; 1741-038X; 0954-4054; 0020-7543; 0360-5442; 1742-6596; 0951-192X; 2213-8463; 2572-6668; 1751-7575 +10.1016/j.promfg.2017.07.094,20.0,0161-0457; 1742-6596; 1996-1944; 1087-1357; 2267-1242; 0951-192X; 2213-8463; 1350-6307; 0306-4549; 0268-3768; 2075-1702; 0020-7543; 1751-7575 +10.1016/j.promfg.2017.07.198,43.0,2363-7005; 2305-7084; 1350-6307; 0268-3768; 2075-1702; 2076-3417; 1999-5903; 0953-7287; 2516-8398; 1996-1073; 1064-1246; 1134-3060; 2624-6511; 2071-1050; 1087-1357; 2227-9717; 2095-8099; 0020-7543; 1729-8814; 2214-7853; 0951-192X; 1871-6784; 2213-8463; 1747-7778; 0005-1055; 1751-7575 +10.1016/j.promfg.2018.04.008,5.0,1742-6596; 0020-7543; 1754-2731; 1463-5771 +10.1016/j.promfg.2018.06.041,2.0,0268-3768; 2331-1916 +10.1016/j.promfg.2018.06.057,3.0,2234-7593; 0268-3768; 0020-7543 +10.1016/j.promfg.2018.07.146,3.0,0926-5805; 1742-6596; 0951-192X +10.1016/j.promfg.2018.07.155,6.0,0268-3768; 2095-8099; 0954-4054; 0020-7543 +10.1016/j.promfg.2018.10.047,10.0,2076-3417; 1424-8220; 0951-192X; 1050-0472; 2234-7593; 2288-6206; 1064-1246 
+10.1016/j.promfg.2018.10.070,1.0,2234-7593 +10.1016/j.promfg.2019.03.035,4.0,2314-4904; 1092-7026; 0954-4054; 1747-7778 +10.1016/j.promfg.2019.06.097,3.0,1050-0472; 2076-3417 +10.1016/j.promfg.2020.02.084,1.0,0268-3768 +10.1016/j.psep.2019.10.021,3.0,2071-1050; 1749-7728 +10.1016/j.rcim.2018.07.006,6.0,2076-3417; 1424-8220; 1080-3548; 0951-192X; 0268-3768 +10.1016/j.rcim.2019.101881,3.0,2071-1050; 0737-8831; 0268-3768 +10.1016/j.rcim.2019.101895,4.0,0161-0457; 0013-7944; 1367-5788; 0268-3768 +10.1016/j.rcim.2019.101917,2.0,1742-6596; 0020-7543 +10.1016/j.resconrec.2019.06.002,3.0,0360-5442; 2071-1050; 2227-9717 +10.1016/j.scriptamat.2016.12.005,31.0,1047-4838; 0022-0434; 2472-5854; 1424-8220; 0267-0836; 0268-3768; 2076-3417; 2073-4360; 2332-9017; 0307-904X; 2079-6412; 2214-8604; 2352-9407; 2071-1050; 1087-1357; 2095-8099; 1674-7321; 0020-7543; 1073-5623 +10.1016/j.ymssp.2019.106612,5.0,2076-3417; 1070-9622; 1367-5788; 1424-8220; 1087-1357 +10.1017/S089006041900012X,9.0,2076-3417; 2577-8196; 0890-0604; 2504-4494; 1742-6596; 1359-4311; 1474-0346; 1996-1073 +10.1038/s41467-020-19059-3,2.0,1616-301X; 2397-4621 +10.1049/iet-cim.2020.0009,1.0,1367-5788 +10.1049/iet-cim.2020.0041,9.0,2076-3417; 2424-8622 +10.1049/iet-epa.2018.5732,1.0,1367-5788 +10.1051/e3sconf/201912301049,1.0,2267-1242 +10.1051/e3sconf/201914004017,1.0,2267-1242 +10.1051/e3sconf/201914006008,2.0,2267-1242 +10.1051/matecconf/201930011004,2.0,2214-8604; 1047-4838 +10.1061/(ASCE)CO.1943-7862.0000825,1.0,1474-0346 +10.1061/(ASCE)ME.1943-5479.0000740,4.0,0926-5805; 2071-1050; 1474-0346; 1424-8220 +10.1061/(ASCE)ME.1943-5479.0000741,3.0,2071-1050; 2227-9717; 0360-3199 +10.1061/(ASCE)ME.1943-5479.0000745,3.0,1093-9687; 2364-8228; 1869-5450 +10.1061/(ASCE)ME.1943-5479.0000748,1.0,1424-8220 +10.1061/(ASCE)ME.1943-5479.0000774,1.0,2079-9292 +10.1061/(ASCE)ME.1943-5479.0000779,1.0,1562-3599 +10.1061/(ASCE)ME.1943-5479.0000797,1.0,0926-5805 +10.1063/1.5031520,7.0,0001-9240; 2071-1050; 1548-1115; 0010-485X; 0268-3768; 0020-7543; 2571-631X +10.1063/1.5034337,1.0,1424-8220 +10.1063/1.5099723,2.0,0888-3270; 1475-9217 +10.1063/1.5128374,2.0,0307-904X; 2311-5521 +10.1080/00207543.2018.1443229,33.0,2076-3417; 2071-1050; 2332-9017; 1729-8814; 0951-192X; 1050-0472; 1868-5137; 2516-8398; 0268-3768; 0020-7543; 0307-904X; 2073-8994 +10.1080/00207543.2018.1471243,31.0,2076-3417; 0096-3003; 2071-1050; 0013-7944; 1424-8220; 1729-8814; 0951-192X; 2234-7593; 2190-7188; 0926-5805; 0268-3768; 0954-4054; 1562-2479; 0020-7543 +10.1080/00207543.2018.1497819,11.0,0360-5442; 2071-1050; 0951-192X; 2516-8398; 0020-7543 +10.1080/00207543.2018.1552032,22.0,2076-3417; 1070-9622; 2071-1050; 1367-5788; 1729-8814; 0360-3199; 1024-123X; 0268-3768; 0020-7543; 1996-1073; 0307-904X; 2073-8994 +10.1080/00207543.2019.1566661,34.0,0965-9978; 2076-3417; 2504-4494; 2071-1050; 1367-5788; 1424-8220; 2624-9375; 0951-192X; 1469-1930; 1474-0346; 0950-7051; 0268-3768; 1743-6753; 0954-4054; 0020-7543 +10.1080/00207543.2019.1581387,9.0,2076-3417; 1530-9827; 1424-8220; 1050-0472; 0268-3768; 0020-7543 +10.1080/00207543.2019.1607978,14.0,0360-5442; 2076-3417; 1424-8220; 1087-1357; 2095-8099; 0268-3768; 0020-7543 +10.1080/00207543.2019.1662133,4.0,0268-3768; 2095-8099; 0020-7543 +10.1080/00207543.2019.1683250,3.0,0020-7543; 2227-9717 +10.1080/00207543.2020.1714091,3.0,0020-7543; 0021-9983 +10.1080/01691864.2017.1297735,2.0,0926-5805; 1424-8220 +10.1080/01969722.2019.1705554,1.0,0360-5442 +10.1080/07373937.2014.962143,19.0,1350-4177; 0360-5442; 2076-3417; 0145-8892; 1866-7910; 0737-3937; 
0947-7411; 2352-4847; 0145-8876 +10.1080/0951192X.2018.1529430,15.0,2076-3417; 1530-9827; 2332-9017; 1424-8220; 1080-3548; 0951-192X; 0268-3768 +10.1080/0951192X.2019.1599433,1.0,0020-7543 +10.1080/0951192X.2019.1599436,6.0,0951-192X; 0268-3768; 2076-3417; 2504-4494 +10.1080/0951192X.2019.1599439,9.0,2071-1050; 0263-2241; 0951-192X; 2288-6206; 2190-7188; 0268-3768; 0020-7543; 1064-1246 +10.1080/0951192X.2019.1686173,9.0,2076-3417; 2071-1050; 0263-2241; 1424-8220; 1087-1357; 0951-192X; 0020-7543 +10.1080/0951192X.2019.1699254,2.0,2071-1050; 2076-3417 +10.1080/0951192X.2020.1747642,1.0,2168-1163 +10.1080/10494820.2013.815221,55.0,1361-4568; 1359-4338; 1574-017X; 1360-2357; 2472-5854; 1463-922X; 1955-2513; 1868-5137; 0268-3768; 1049-4820; 2077-1312; 2095-2686; 0007-1013; 2076-3417; 1080-3548; 0018-7208; 2468-2322; 0141-9331; 0735-6331; 0144-929X; 1042-1629; 2073-431X; 1044-8004; 2071-1050; 0265-671X; 2414-4088; 0020-7543; 1044-7318; 2296-9144; 0952-1976; 1742-6596; 2261-236X +10.1080/14786451.2012.724070,21.0,1744-2591; 0199-6231; 1556-3758; 1866-7910; 0737-3937; 2095-1701; 2071-1050; 1556-7036; 2523-3963; 0143-0750; 0947-7411; 1543-5075; 2352-4847; 1064-1246; 2050-0505 +10.1080/15732479.2019.1620789,4.0,1573-2479; 1687-8086; 2076-3417 +10.1080/16864360.2018.1462569,9.0,0360-5442; 1530-9827; 2193-9764; 2332-9017; 0951-192X; 1050-0472; 0954-4054 +10.1080/17517575.2018.1526324,14.0,2076-3417; 0888-3270; 0954-4054; 2071-1050; 1424-8220; 1729-8814; 2227-9717; 0951-192X; 2234-7593; 2391-5439; 0020-7543 +10.1080/21693277.2019.1660283,3.0,0951-192X; 0268-3768; 2076-3417 +10.1080/25726668.2019.1569367,5.0,0161-0457; 0360-3199; 0951-192X; 2572-6668; 2073-8994 +10.1080/25726668.2019.1645519,2.0,2572-6668; 2073-8994 +10.1088/0957-0233/24/9/095401,1.0,0040-5175 +10.1088/1674-4527/20/5/67,1.0,1674-4527 +10.1088/1742-6596/1168/2/022044,1.0,1573-062X +10.1088/1742-6596/1391/1/012083,1.0,2571-631X +10.1088/1742-6596/1618/2/022065,1.0,2071-1050 +10.1088/1757-899X/156/1/012002,1.0,1742-6596 +10.1088/1757-899X/324/1/012077,6.0,1134-3060; 2076-3417; 2640-4567; 1359-4311; 0020-7543 +10.1088/1757-899X/459/1/012075,1.0,0040-5175 +10.1088/1757-899X/711/1/012017,1.0,2079-6412 +10.1088/1757-899X/739/1/012048,1.0,2261-236X +10.1108/CI-11-2019-0133,4.0,0926-5805; 1562-3599; 0969-9988; 2071-1050 +10.1108/ECAM-11-2019-0640,3.0,2071-1050; 1472-5967 +10.1108/IMDS-01-2018-0033,5.0,0360-5442; 0268-3768; 0951-192X; 1741-038X; 1463-7154 +10.1108/RPJ-06-2012-0058,15.0,2214-8604; 2411-5134; 1355-2546; 1738-494X; 2523-3963; 0268-3768; 1955-2513; 2631-8695 +10.1108/RPJ-12-2016-0210,4.0,2076-3417; 0268-3768; 0010-4485; 0730-6679 +10.1109/ACCESS.2017.2657006,57.0,2079-9292; 1077-5463; 1424-8220; 1532-0626; 1868-5137; 2288-6206; 0268-3768; 2076-3417; 2334-5837; 1386-145X; 2332-9017; 0141-9331; 1474-0346; 0926-5805; 1996-1073; 2050-7038; 0890-0604; 2071-1050; 2227-9717; 0010-485X; 1074-5351; 0954-4054; 0020-7543; 2078-2489; 2073-8994; 1742-6596; 0951-192X; 2213-8463; 2572-6668; 0929-6212; 1747-7778; 1617-9846; 1751-7575; 2424-8622 +10.1109/ACCESS.2017.2756069,56.0,2040-4166; 0954-4062; 1868-5137; 2288-6206; 0268-3768; 2075-1702; 2076-3417; 0953-7287; 1474-0346; 2516-8398; 2227-7080; 0965-9978; 2071-1050; 2156-485X; 1367-5788; 2227-9717; 2095-8099; 0954-4054; 0020-7543; 0360-5442; 1530-9827; 1742-6596; 1729-8814; 0951-192X; 2234-7593; 1747-7778 +10.1109/ACCESS.2017.2766453,35.0,2050-7038; 2076-3417; 2079-9292; 0890-0604; 1424-8220; 1729-8814; 2267-1242; 0951-192X; 0944-6524; 1050-0472; 1868-5137; 2095-8099; 0268-3768; 2075-1702; 
0954-4054; 2391-5439; 0020-7543 +10.1109/ACCESS.2018.2793265,66.0,1573-062X; 1866-7910; 2041-1723; 2079-9292; 2079-3197; 1424-8220; 0029-5981; 1868-5137; 2288-6206; 0268-3768; 1741-0398; 2075-1702; 2333-5777; 2076-3417; 1746-5664; 1478-0771; 0953-7287; 1092-7026; 2332-9017; 1471-4175; 1474-0346; 0138-9130; 0926-5805; 2227-7080; 2352-9407; 2071-1050; 1367-5788; 1087-1357; 1469-1930; 2095-8099; 0265-671X; 0954-4054; 0020-7543; 0360-5442; 2058-8437; 1742-6596; 0951-192X; 1476-1122 +10.1109/ACCESS.2018.2890566,7.0,2076-3417; 0888-3270; 1367-5788; 0951-192X; 0925-2312; 2078-2489 +10.1109/ACCESS.2019.2891060,5.0,1367-5788; 0020-7543; 1729-8814 +10.1109/ACCESS.2019.2893309,5.0,0161-0457; 0951-192X; 1678-5878; 1063-293X; 0020-7543 +10.1109/ACCESS.2019.2897018,4.0,0268-3768; 0013-7944; 1424-8220; 1751-7575 +10.1109/ACCESS.2019.2909828,11.0,1134-3060; 2079-9292; 2398-6352; 1617-4909; 1424-8220; 2214-7853; 0010-485X; 1793-9623; 1751-7575 +10.1109/ACCESS.2019.2923610,8.0,2076-3417; 2640-4567; 1742-6596; 0263-2241; 1424-8220; 2524-521X; 2095-8099 +10.1109/ACCESS.2019.2928141,9.0,2076-3417; 1424-8220; 2072-4292; 1678-5878; 2073-8994 +10.1109/ACCESS.2019.2946515,4.0,2071-1050; 1367-5788; 0969-9988 +10.1109/ACCESS.2019.2950507,2.0,2076-3417; 1424-8220 +10.1109/ACCESS.2019.2950955,1.0,1367-5788 +10.1109/ACCESS.2019.2953499,7.0,2076-3417; 1098-1241; 2073-431X; 1424-8220; 2213-8463 +10.1109/ACCESS.2020.2970143,10.0,2076-3417; 2071-1050; 1742-6596; 2470-0045; 2227-9717 +10.1109/ACCESS.2020.2971576,1.0,2073-431X +10.1109/ACCESS.2020.2974241,1.0,0268-3768 +10.1109/ACCESS.2020.2974810,3.0,2504-4494; 2071-1050; 1424-8220 +10.1109/ACCESS.2020.2981745,13.0,2076-3417; 2161-3915; 1225-6463; 2079-9292; 2071-1050; 1999-5903; 2327-0012; 1424-8220; 1687-1499; 2424-8622 +10.1109/ACCESS.2020.2998358,8.0,2076-3417; 2071-1050; 1742-6596; 1424-8220; 0096-3003; 1996-1073; 2571-631X +10.1109/ACCESS.2020.2998723,3.0,1050-0472; 1474-0346; 2571-631X +10.1109/ACCESS.2020.2999871,2.0,2040-2295; 1424-8220 +10.1109/ACCESS.2020.3000437,1.0,1424-8220 +10.1109/AIM.2018.8452707,1.0,0951-192X +10.1109/BigData.2018.8622412,1.0,0020-7543 +10.1109/CIST.2018.8596460,1.0,0268-3768 +10.1109/COASE.2019.8842888,1.0,2267-1242 +10.1109/COASE.2019.8843166,2.0,2214-8604 +10.1109/COASE.2019.8843269,1.0,0361-7688 +10.1109/CyberC.2014.30,1.0,1942-4787 +10.1109/EDOCW.2018.00021,2.0,0951-192X; 0020-7543 +10.1109/EIConRus.2019.8656681,1.0,1229-7607 +10.1109/EMBC.2014.6943963,1.0,1955-2513 +10.1109/ETFA.2017.8247583,3.0,2288-5048; 1424-8220; 2411-5134 +10.1109/ETFA.2017.8247712,2.0,0951-192X; 1687-8086 +10.1109/ETFA.2018.8502467,3.0,0178-2312; 2424-8622 +10.1109/ETFA.2019.8868954,1.0,0268-3768 +10.1109/GHTC46095.2019.9033075,1.0,1999-5903 +10.1109/GLOBECOM38437.2019.9013428,1.0,2076-3417 +10.1109/HRI.2019.8673015,1.0,2296-9144 +10.1109/ICAC.2016.29,13.0,1433-2779; 0951-192X; 2364-415X; 1474-0346; 0010-485X; 2288-6206; 1741-038X; 0268-3768; 0020-7543 +10.1109/ICCCBDA.2018.8386518,1.0,0268-3768 +10.1109/ICCVE45908.2019.8965086,1.0,1424-8220 +10.1109/ICE.2019.8792577,1.0,1092-7026 +10.1109/ICE.2019.8792613,1.0,2523-3963 +10.1109/ICE.2019.8792622,1.0,2523-3963 +10.1109/ICE/ITMC49519.2020.9198403,1.0,2076-3417 +10.1109/ICIMTech.2019.8843814,1.0,2267-1242 +10.1109/ICITM48982.2020.9080395,1.0,1999-5903 +10.1109/ICNSC.2018.8361283,1.0,0268-3768 +10.1109/ICNSC.2018.8361285,1.0,1474-0346 +10.1109/ICNSC.2018.8361293,3.0,0268-3768; 1742-6596 +10.1109/ICSE-NIER.2019.00011,1.0,2524-8510 +10.1109/ICTC.2018.8539690,1.0,0951-192X +10.1109/IECON.2018.8591464,2.0,2076-3417; 
0009-286X +10.1109/IECON.2018.8591653,1.0,1742-6596 +10.1109/IEEM.2017.8289898,5.0,0951-192X; 0268-3768; 2076-3417; 2424-8622 +10.1109/INDIN.2016.7819217,8.0,2071-1050; 0951-192X; 1868-5137; 0937-7255; 0268-3768; 0020-7543 +10.1109/INDIN.2018.8471979,3.0,0268-3768; 2210-464X; 2424-8622 +10.1109/INDIN.2018.8472014,2.0,0951-192X; 2504-4494 +10.1109/INDIN.2018.8472083,1.0,2504-4494 +10.1109/INDIN41052.2019.8972134,2.0,2504-4494; 1424-8220 +10.1109/INDIN41052.2019.8972267,1.0,2267-1242 +10.1109/IROS.2016.7759171,1.0,0253-3839 +10.1109/IS.2018.8710526,1.0,1742-6596 +10.1109/ISIE.2019.8781529,1.0,2523-3963 +10.1109/ITAIC.2019.8785703,1.0,2523-3963 +10.1109/JPROC.2017.2725482,6.0,2051-3305; 2079-9292; 0164-1212; 2213-8463; 1049-8923; 1687-1499 +10.1109/JPROC.2020.2998530,5.0,1871-6784; 1999-5903; 2076-3417; 2192-113X +10.1109/JSYST.2019.2925627,1.0,1996-1073 +10.1109/KBEI.2015.7436192,1.0,2214-7853 +10.1109/M2VIP.2018.8600844,2.0,2076-3417; 1474-0346 +10.1109/MLSD.2018.8551867,1.0,2267-1242 +10.1109/MSP.2018.2842228,3.0,2476-1508; 0268-3768 +10.1109/MetroInd4.0IoT48571.2020.9138264,1.0,1996-1073 +10.1109/NAPS46351.2019.9000371,1.0,1999-5903 +10.1109/PICMET.2016.7806826,2.0,2161-3915 +10.1109/RUSAUTOCON.2019.8867800,1.0,1742-6596 +10.1109/SAS.2019.8706111,1.0,1999-5903 +10.1109/SEsCPS.2019.00012,2.0,0953-5438; 2076-3417 +10.1109/SIBIRCON48586.2019.8958367,1.0,0361-7688 +10.1109/SII.2017.8279217,1.0,1742-6596 +10.1109/SPEEDAM.2018.8445302,1.0,0937-7255 +10.1109/SSCI.2017.8285439,1.0,0003-682X +10.1109/SYSCON.2017.7934796,3.0,0041-624X; 2334-5837 +10.1109/SYSOSE.2018.8428748,1.0,0951-192X +10.1109/SYSOSE.2019.8753845,1.0,0268-3768 +10.1109/SYSOSE.2019.8753860,1.0,2523-3963 +10.1109/SysEng.2016.7753162,7.0,1742-6596; 2214-7853; 0951-192X; 2213-8463; 0268-3768; 0005-1055; 1996-1073 +10.1109/TII.2018.2804917,19.0,2076-3417; 2071-1050; 1092-7026; 1741-0401; 1424-8220; 0951-192X; 0141-9331; 0178-2312; 1868-5137; 1350-6307; 0926-5805; 0268-3768; 2075-1702; 0020-7543 +10.1109/TII.2018.2873186,46.0,0888-3270; 1573-062X; 1098-1241; 1424-8220; 1868-5137; 0268-3768; 0026-2714; 2076-3417; 1474-0346; 0926-5805; 1996-1073; 0307-904X; 0306-2619; 2071-1050; 1367-5788; 1087-1357; 0950-7051; 0160-5682; 0954-4054; 0020-7543; 2073-8994; 0360-5442; 0013-7944; 1742-6596; 2470-0045; 2523-3963; 2214-7853; 0951-192X +10.1109/TII.2019.2938885,1.0,1996-1073 +10.1109/TII.2020.2977113,1.0,1424-8220 +10.1109/TLA.2020.9082917,1.0,1999-5903 +10.1109/TPEL.2019.2911594,8.0,1367-5788; 1742-6596; 0263-2241; 2194-5756; 0894-3370; 1996-1073; 2073-8994 +10.1109/TSMC.2019.2930418,4.0,2071-1050; 0020-7543; 1424-8220 +10.1109/TWC.2019.2927312,2.0,0268-3768; 1999-4893 +10.1109/UCC-Companion.2018.00039,1.0,0361-7688 +10.1109/WF-IoT.2018.8355217,1.0,2214-7853 +10.1109/WSC.2018.8632242,3.0,2076-3417; 2523-3963; 2213-8463 +10.1109/WSC40007.2019.9004659,2.0,2076-3417; 0020-7543 +10.1111/caim.12082,8.0,0963-1690; 0954-4062; 1363-9196; 1955-2513; 0934-9839 +10.1111/cgf.14023,1.0,0167-7055 +10.1111/exsy.12064,7.0,1042-6914; 0952-1976; 0254-5330; 0377-2217; 0361-2317; 1687-5265; 0950-7051 +10.1117/12.2042170,2.0,1674-1056; 1644-9665 +10.1134/S0005117915080111,1.0,0005-1179 +10.1145/3061639.3079847,1.0,2214-8604 +10.1155/2011/154798,65.0,0888-3270; 1047-4838; 0001-1452; 1868-5137; 1350-6307; 2364-415X; 0268-3768; 2075-1702; 1687-5966; 2076-3417; 2193-9764; 2332-9017; 2468-2322; 1475-9217; 0926-5805; 0307-904X; 0161-0457; 0268-4012; 2071-1050; 2156-485X; 1367-5788; 2227-9717; 0360-3199; 1050-0472; 0020-7543; 0022-460X; 1478-422X; 0013-7944; 1530-9827; 
0951-192X; 2213-8463; 1687-8086; 2261-236X; 1751-7575 +10.1155/2013/263218,1.0,1687-725X +10.1155/2014/439278,17.0,0376-9429; 2451-9049; 2071-1050; 0013-7944; 1367-5788; 2332-9017; 1424-8220; 2214-7853; 0094-9930; 1868-5137; 2364-415X; 0268-3768; 2571-631X +10.1155/2014/648562,2.0,0264-1275; 2073-4360 +10.1155/2020/8888876,2.0,1424-8220 +10.1177/0040517516632471,5.0,1558-9250; 0040-5175 +10.1177/0954405412463857,14.0,2214-8604; 1355-2546; 2050-7526; 0264-1275; 1087-1357; 1779-6288; 1050-0472; 1745-2759; 0268-3768; 0954-4054 +10.1177/0954405413500663,1.0,1748-006X +10.1177/0954406212473037,4.0,1674-7321; 0954-4828; 0954-4062 +10.1177/0954406219854466,3.0,1050-0472; 1424-8220; 1087-1357 +10.1177/1847979019828570,2.0,1847-9790 +10.1186/s40323-020-00147-4,1.0,0268-3768 +10.13196/j.cims.2017.01.001,1.0,1868-5137 +10.13196/j.cims.2017.04.010,1.0,2076-3417 +10.13196/j.cims.2017.08.001,1.0,0951-192X +10.13196/j.cims.2019.01.001,2.0,1687-8086; 1742-6596 +10.1515/auto-2017-0133,4.0,0178-2312; 1093-9687; 2073-8994 +10.1515/auto-2019-0039,5.0,0178-2312; 2076-3417; 1367-5788; 1742-6596 +10.1515/eng-2020-0039,1.0,1424-8220 +10.1515/eng-2020-0040,1.0,2076-3417 +10.1515/itit-2017-0038,1.0,0951-192X +10.1515/mspe-2019-0004,2.0,2076-3417 +10.1515/orga-2017-0017,20.0,2076-3417; 1746-5664; 2043-9377; 2071-1050; 0001-5970; 0953-7287; 1955-2513; 0951-192X; 1868-5137; 2288-6206; 1747-7778; 1366-5545; 0954-4054 +10.15439/2017F253,1.0,1424-8220 +10.21917/ijsc.2016.0173,2.0,1947-9344; 0256-2499 +10.2514/1.J055201,25.0,0888-3270; 1098-1241; 0001-1452; 1424-8220; 0748-8017; 0029-5981; 1868-5137; 1350-6307; 0268-3768; 2076-3417; 2332-9017; 0307-904X; 2571-631X; 1367-5788; 0954-4054; 2073-8994; 0013-7944; 1530-9827; 0951-192X; 0892-7219; 1024-123X; 1573-2479 +10.3103/S1068798X19060194,1.0,1068-798X +10.3103/S1068798X19100101,1.0,1068-798X +10.3103/S1068798X19120104,2.0,1742-6596; 1996-1073 +10.3233/FI-2020-1943,2.0,2078-2489; 1367-4803 +10.3390/app10020486,2.0,2076-3417; 2414-4088 +10.3390/app10072377,2.0,2076-3417; 1424-8220 +10.3390/app10082854,1.0,1424-8220 +10.3390/app10103342,3.0,2076-3417 +10.3390/app10103633,1.0,2076-3417 +10.3390/app10134678,2.0,2071-1050; 1424-8220 +10.3390/app10186519,2.0,2076-3417 +10.3390/app9183780,8.0,2076-3417; 2075-5309; 1471-4175 +10.3390/app9245567,5.0,1099-4300; 2076-3417 +10.3390/designs4020009,1.0,1742-6596 +10.3390/electronics9020319,6.0,2079-6374; 1424-8220; 1996-1073 +10.3390/en12101909,8.0,2079-6412; 2073-4352; 1996-1073 +10.3390/en12122389,7.0,0360-5442; 2073-4352; 1996-1073 +10.3390/en13184762,1.0,1996-1073 +10.3390/en13184979,1.0,1996-1944 +10.3390/en13205413,1.0,2075-1702 +10.3390/fi12090159,1.0,2076-3417 +10.3390/fi12100163,1.0,2076-3417 +10.3390/ijgi6070208,1.0,2267-1242 +10.3390/ijgi9040228,3.0,2071-1050; 2412-3811; 1996-1073 +10.3390/jmmp4030092,1.0,2504-4494 +10.3390/jmse8030200,3.0,2076-3417; 1996-1073; 2077-1312 +10.3390/machines7010002,3.0,0141-9331; 1076-2787; 2076-3417 +10.3390/mi11060614,3.0,0264-1275; 2073-4360; 0032-3888 +10.3390/pr7020094,12.0,1871-6784; 2227-9717; 2296-4185 +10.3390/pr7080537,1.0,0263-2241 +10.3390/pr8070866,1.0,2227-9717 +10.3390/s17112488,12.0,2076-3417; 1424-8220; 2504-446X; 2072-4292; 1385-2256; 0378-3774; 0143-1161; 2078-2489 +10.3390/s19173781,6.0,2227-9717; 1424-8220 +10.3390/s19204410,1.0,0268-3768 +10.3390/s20010097,4.0,2071-1050; 2076-3417; 1424-8220 +10.3390/s20041187,1.0,1687-8086 +10.3390/s20123515,2.0,1424-8220 +10.3390/s20133709,2.0,2071-1050; 1424-8220 +10.3390/s20164637,1.0,2073-8994 +10.3390/s20175003,1.0,1424-8220 
+10.3390/su11010159,10.0,2071-1050; 1940-1493; 1996-1073 +10.3390/su11185036,1.0,1024-123X +10.3390/su12030936,3.0,2071-1050; 1424-8220 +10.3390/su12031088,7.0,2076-3417; 2071-1050; 1742-6596; 1424-8220; 1474-0346; 0020-7543 +10.3390/su12062286,6.0,2071-1050; 2076-3417; 1996-1073 +10.3390/su12062307,5.0,2624-6511; 2071-1050; 2220-9964; 1424-8220 +10.3390/su12072940,1.0,2071-1050 +10.3991/ijoe.v13i08.7270,1.0,2218-6581 +10.4028/www.scientific.net/AMM.575.493,9.0,0178-7675; 0263-2241; 1424-8220; 0264-4401; 0268-3768; 0941-0643; 0217-9792; 2053-1591 +10.4028/www.scientific.net/AMR.472-475.206,1.0,0020-7403 +10.4028/www.scientific.net/MSF.957.340,1.0,1022-1360
diff --git a/doi_parse/合并统计.py b/doi_parse/合并统计.py
new file mode 100644
index 0000000..544ca86
--- /dev/null
+++ b/doi_parse/合并统计.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# @Time : 2022/6/6 10:26
+# @Author : ZhaoXiangPeng
+# @File : 合并统计.py
+
+import pandas as pd
+from 数据处理.文件合并 import join, load_file  # helpers from the data-processing / file-merge module
+
+path = 'E:/inspec合并/'
+files = load_file(path)
+# files = [f'{path}{f}' for f in files]
+print(files)
+
+
+def count(file_name: str):
+    temp_df: pd.DataFrame = pd.read_csv(file_name)
+    # group: pd.Series = temp_df.groupby(by=['doi'])['count'].sum()
+    # return pd.DataFrame(data=group)
+    return temp_df  # per-doi grouping is done after all files are concatenated
+
+
+df0 = pd.read_csv(r'F:\工作数据存储2022\20220526_inspec测试\Digital twin searching result 2.csv')
+df0 = df0[['DI']]  # keep only the DOI column
+df0.drop_duplicates(inplace=True)
+big_df = pd.DataFrame()
+for f in files:
+    t_df = count(path+f)
+    # Reset the index so the merge below can match counts by doi
+    t_df = t_df.reset_index()
+    ts = pd.merge(df0, t_df, how='left', left_on=['DI'], right_on=['doi'])
+    ts = ts[ts['count'].notnull()]
+    big_df = pd.concat([big_df, ts], ignore_index=True)
+pp = []
+group_2 = big_df.groupby(by=['doi'])  # aggregate per doi: sum counts, union citing ISSNs
+for _, g in group_2:
+    row = {'doi': _, 'count': g['count'].sum(), 'issn': '; '.join(list(set(g['issn'])))}
+    pp.append(row)
+df = pd.DataFrame(data=pp)
+df.to_csv('inspec数据库施引测试数据.csv', index=False)
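For reference, the file written at the end of 合并统计.py has the same shape as the rows listed above: a cited DOI, its summed citation count, and a "; "-joined list of citing-journal ISSNs. Below is a minimal sketch (not part of this change set) of one way to consume that output and tally citing journals; the column names doi/count/issn follow the script above, everything else is illustrative.

import pandas as pd

# Read the per-DOI aggregation produced by 合并统计.py:
# columns are doi, count (summed citing records) and issn ("; "-joined citing journals).
agg = pd.read_csv('inspec数据库施引测试数据.csv')

# Explode the joined ISSN string into one row per (doi, citing ISSN) pair.
pairs = agg.assign(issn=agg['issn'].str.split(';')).explode('issn')
pairs['issn'] = pairs['issn'].str.strip()

# Number of distinct cited DOIs per citing journal, largest first.
issn_tally = pairs.groupby('issn')['doi'].nunique().sort_values(ascending=False)
print(issn_tally.head(20))

Exploding the issn column turns the joined string back into one row per citing journal, which makes further grouping or merging against journal-level subject tables straightforward.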