|
|
# -*- coding: utf-8 -*-
|
|
|
# @Time : 2025/1/8 14:28
|
|
|
# @Author : zhaoxiangpeng
|
|
|
# @File : 20250108_func.py
|
|
|
# 1.引文表构建
|
|
|
# 1.1.解析发文表的引文字段
|
|
|
# 1.2.下载有doi的引文的完整记录
|
|
|
# 1.3.对有doi的引文进行完整记录匹配
|
|
|
# 1.4.没有doi的发文要保留解析的其他信息 (CR字段扩展.csv)
|
|
|
# 1.5.需要保留发文的唯一id用来匹配 (标准doi扩展完整字段.csv)
|
|
|
# 2.发文表构建
|
|
|
# 2.1.对不同的学科与学校发文进行匹配,获取到发文的学校、学者
|
|
|
# 2.2.发文信息的结果与引文信息的结果进行匹配
|
|
|
|
|
|
import os
|
|
|
import re
|
|
|
|
|
|
import pandas as pd
|
|
|
from pymongo import MongoClient
|
|
|
|
|
|
# Directory with the per-school WoS raw export files (stage-1 input).
READ_PATH = 'Y:\\wos-metadata\\SCHOOL\\bnu'
# Root working directory of this task; all intermediate CSVs are written here.
# NOTE(review): single backslashes only survive because these characters are
# not valid escape sequences -- a raw string would be safer.
ROOT_PATH = 'Y:\数据采集需求\陶思琪20211203'

# Marker that precedes a DOI inside a cited-reference (CR) string.
DOI_SPLIT_SYMBOL = 'DOI '
DOI_SPLIT_SYMBOL_LENGTH = len(DOI_SPLIT_SYMBOL)
# Separator between the author / year / source segments of a reference.
INFO_SPLIT_SYMBOL = ', '
INFO_SPLIT_SYMBOL_LENGTH = len(INFO_SPLIT_SYMBOL)

# The citation table needs a UT field to act as its primary key.
REF_RECORD_TABLE_FIELD = ['UT', 'doi']
# Full-record fields to keep, {'exported.%s' % key.lower() : 1 for key in FULL_RECORD_TABLE_FIELD}
FULL_RECORD_TABLE_FIELD = ['DI', 'SO', 'DT', 'UT', 'AB', 'SN', 'EI', 'BN', 'PY']
# Maps the lower-case Mongo field names back to the WoS column names.
TABLE_HEAD_TRANS = {k.lower(): k for k in FULL_RECORD_TABLE_FIELD}

# SECURITY NOTE(review): credentials are hard-coded; move them to an
# environment variable or a config file before this script is shared.
MONGODB_REMOTE_CONFIG = dict(
    host='101.43.239.105',
    port=27017,
    db='science2',
    username='science-dev',
    password='kcidea1509!%25)(',
    url='mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true'
)

# Module-level client/database handle reused by every Mongo query below.
client: MongoClient = MongoClient(MONGODB_REMOTE_CONFIG.get("url"))
database = client.get_database(MONGODB_REMOTE_CONFIG.get('db'))
|
|
|
|
|
|
|
|
|
def find_doi_data_from_mongo(doi_list: list):
    """Stream WoS full records matching the given DOIs from MongoDB.

    Yields each matching document's ``exported`` sub-dict, with the
    document's ``third_id`` folded in under the ``'ut'`` key so it can
    serve as the citation table's primary key.
    """
    coll = database.get_collection('data_wos_article')
    wanted_fields = {
        "_id": 0, "third_id": 1, 'exported.ab': 1, 'exported.bn': 1,
        'exported.di': 1, 'exported.ei': 1, 'exported.is': 1,
        'exported.dt': 1, 'exported.sn': 1, 'exported.so': 1,
    }
    cursor = coll.find(
        filter={"exported.di": {"$in": doi_list}},
        projection=wanted_fields,
    ).collation({"locale": "en", "strength": 2})  # strength=2: case-insensitive match
    for doc in cursor:
        record: dict = doc.get('exported')
        record.setdefault('ut', doc.get('third_id'))
        yield record
|
|
|
|
|
|
|
|
|
def step_2_2(base_table: pd.DataFrame, ref_table: pd.DataFrame):
    """2.2. Join the publication results with the citation results.

    Left-joins ``ref_table`` onto ``base_table``, matching each
    publication's ``wos`` id against the reference's ``third_id``.
    """
    joined = pd.merge(
        base_table,
        ref_table,
        how='left',
        left_on=['wos'],
        right_on=['third_id'],
    )
    return joined
|
|
|
|
|
|
|
|
|
def verify_doi(text) -> str:
    """Extract the bare DOI from a reference fragment.

    Strips an optional leading ``'DOI '`` marker and a single trailing
    ``')'``.  If no marker is present, the whole fragment is treated as
    the DOI candidate.
    """
    marker_pos = text.find(DOI_SPLIT_SYMBOL)
    candidate = text if marker_pos == -1 else text[marker_pos + DOI_SPLIT_SYMBOL_LENGTH:]
    # Some references wrap the DOI in parentheses; drop a dangling ')'.
    if candidate.endswith(')'):
        candidate = candidate[:-1]
    return candidate
|
|
|
|
|
|
|
|
|
def ref_str2dic(text):
    """Parse a WoS ``CR`` (cited references) field into per-DOI dicts.

    *text* is a ``'; '``-separated list of references.  Each reference
    yields one dict per DOI found, or a single dict with ``doi=None``
    when the reference carries no ``'DOI '`` marker.  Every dict also
    carries the parsed author (``au``), year (``py``), source (``so``)
    and the raw reference string (``ref``).
    """
    for ref in text.split('; '):
        # The first three ', '-separated segments are author / year / source.
        # NOTE: when fewer than three separators exist, find() returns -1
        # and the slicing silently degrades (original behaviour kept).
        remainder = ref
        head = []
        for _ in range(3):
            cut = remainder.find(INFO_SPLIT_SYMBOL)
            head.append(remainder[:cut])
            remainder = remainder[cut + INFO_SPLIT_SYMBOL_LENGTH:]
        au, py, so = head
        # A plausible year must start with four digits; otherwise drop it.
        if not re.match(r'\d{4}', py):
            py = None
        # Carry the raw reference text along with the parsed fields.
        model = dict(au=au, py=py, so=so, ref=ref)
        doi_idx = ref.find(DOI_SPLIT_SYMBOL)
        if doi_idx == -1:
            # No DOI marker: still emit the parsed metadata.
            record = dict(doi=None)
            record.update(**model)
            yield record
            continue
        doi_text = ref[doi_idx + DOI_SPLIT_SYMBOL_LENGTH:]
        if doi_text.startswith('['):
            # Multi-DOI form '[DOI a, DOI b]': one record per DOI.
            candidates = doi_text[1:-1].split(', ')
        else:
            candidates = [doi_text]
        for candidate in candidates:
            # Empty candidates are dropped, matching the original walrus guard.
            if doi := verify_doi(candidate):
                record = dict(doi=doi)
                record.update(**model)
                yield record
|
|
|
|
|
|
|
|
|
def step_1_1():
    """1.1. Parse the cited-references (CR) field of the publication tables.

    Walks every school directory under READ_PATH, expands each record's
    CR field into one row per cited DOI (see ref_str2dic), and writes one
    '<school>.csv' per school into the 'doi展开' folder under ROOT_PATH.

    Fix: the output directory previously duplicated the ROOT_PATH literal
    ('Y:\数据采集需求\陶思琪20211203') instead of using the constant.
    """
    for school in os.listdir(READ_PATH):
        # Skip the output folders that live next to the school directories.
        if school == 'doi展开' or school == '待下载DOI':
            continue
        expanded_rows = []
        school_path = os.path.join(READ_PATH, school)
        for file in os.listdir(school_path):
            file_path = os.path.join(school_path, file)
            # NOTE(review): error_bad_lines was removed in pandas 2.0;
            # switch to on_bad_lines='skip' when upgrading.
            table = pd.read_csv(file_path, sep='\t', error_bad_lines=False)
            table = table[['UT', 'CR']]
            for third_id, reference_text in table.values.tolist():
                # Records without any cited references are skipped.
                if pd.isna(reference_text):
                    continue
                for row in ref_str2dic(reference_text):
                    # Keep the citing record's UT id for later matching.
                    row.setdefault('third_id', third_id)
                    expanded_rows.append(row)

        out_path = os.path.join(ROOT_PATH, 'doi展开',
                                '%s.csv' % os.path.basename(school_path))
        pd.DataFrame(expanded_rows).to_csv(out_path, index=False)
|
|
|
|
|
|
|
|
|
def step_1_2():
    """1.2. Download the full records of cited references that have a DOI.

    Reads the pending DOI list, queries MongoDB in batches of 500 and
    writes all fetched records to 'CR字段扩展.csv' (tab-separated).
    """
    batch_size = 500  # batch download limit per Mongo query
    pending = pd.read_csv(os.path.join(ROOT_PATH, '待下载DOI.csv'))
    tasks = pending['DO'].values.tolist()
    fetched = []
    for start in range(0, len(tasks), batch_size):
        chunk = tasks[start: start + batch_size]
        fetched.extend(find_doi_data_from_mongo(doi_list=chunk))
    result = pd.DataFrame(fetched)
    result.to_csv(os.path.join(ROOT_PATH, 'CR字段扩展.csv'), sep='\t', index=False)
|
|
|
|
|
|
|
|
|
def step_1_3():
    """Attach full records to the expanded citation DOIs.

    1.3. match full records for references that have a DOI;
    1.4. references without a DOI keep their parsed metadata.
    Writes the combined table to '标准doi扩展完整字段.csv'.
    """
    expanded = pd.read_csv(os.path.join(ROOT_PATH, 'doi展开.csv'))
    full_records = pd.read_csv(os.path.join(ROOT_PATH, 'CR字段扩展.csv'), sep='\t')
    full_records.rename(columns=TABLE_HEAD_TRANS, inplace=True)

    # Lower-case both DOI columns before joining so that case
    # differences cannot prevent a match.
    expanded['doi_backup'] = expanded['doi'].str.lower()
    full_records['DI'] = full_records['DI'].str.lower()

    # Left join: keeps DOI-less references (1.4) while enriching the
    # rest with full-record fields (1.3).
    # debug hint: doi_expand[doi_expand['third_id']=='WOS:000426769900009']
    merged = expanded.merge(right=full_records, how='left',
                            left_on=['doi_backup'], right_on=['DI'])

    # Rename headers to the agreed output names.
    merged.rename(
        columns={'au': '作者', 'py': '年份', 'so': '刊名-简称',
                 'SO': 'citedTitle', 'EI': 'online issn', 'BN': 'isbn'},
        inplace=True
    )
    # Placeholder columns required by the downstream schema.
    merged['citedWork'] = None
    merged['hot'] = None
    # Keep only the columns used downstream.
    keep = ['third_id', '作者', '年份', '刊名-简称', 'doi', 'citedTitle',
            'citedWork', 'hot', 'DT', 'UT', 'AB', 'SN', 'online issn', 'isbn']
    merged[keep].to_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'),
                        sep='\t', index=False)
|
|
|
|
|
|
|
|
|
def step_2_1(table: pd.DataFrame, ai_result_table):
    """2.1. Attach school / scholar info to a subject's publication table.

    Keeps the publication fields needed downstream, left-joins the
    recognised author/school table on the WoS id, adds the empty id
    columns required by the output schema, and renames everything to
    the agreed column names.

    Fix: 'uid' was previously assigned twice (harmless but redundant).

    :param table: raw publication table with at least UT/TC/PY/IS/VL/CR.
    :param ai_result_table: recognition results keyed by '文献编号',
        carrying '本校的作者' and '所属学校'.
    :return: normalised publication/author table.
    """
    # Keep only the fields used downstream.
    table = table[['UT', 'TC', 'PY', 'IS', 'VL', 'CR']]
    # Match the recognised scholars and schools by WoS id.
    au_table = table.merge(right=ai_result_table, how='left',
                           left_on=['UT'], right_on=['文献编号'])

    # Placeholder id columns required by the output schema.
    au_table['uid'] = None
    au_table['docid'] = None
    au_table['articleId'] = None

    # Rename to the agreed output column names.
    au_table.rename(
        columns={'UT': 'wos', '本校的作者': 'citedAuthor', 'TC': 'timesCited',
                 'PY': 'year', 'IS': 'page', 'VL': 'volume', 'CR': '参考文献'},
        inplace=True
    )
    # Keep only the columns the result table needs.
    au_table = au_table[['wos', 'uid', 'docid', 'articleId', '所属学校',
                         'citedAuthor', 'timesCited', 'year', 'page',
                         'volume', '参考文献']]
    return au_table
|
|
|
|
|
|
|
|
|
def main_step1():
    """Run stage 1 (citation-table construction).

    Only the CR-field parsing (1.1) is active; the download (1.2) and
    matching (1.3) steps are enabled manually once its output exists.
    """
    step_1_1()
    # step_1_2()
    # step_1_3()  # WOS:000426769900009
|
|
|
|
|
|
|
|
|
def main_step2():
    """Run stage 2: build the publication table per subject and join it
    with the citation table, writing one Excel result per subject."""
    ai_result_table = pd.read_csv(os.path.join(ROOT_PATH, '学校地址信息1.8.csv'))
    reference_table = pd.read_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'), sep='\t')
    # Directory holding one merged source table per subject.
    subject_dir = os.path.join(ROOT_PATH, '原始数据合并')
    # subject_files = os.listdir(subject_dir)
    subject_files = ['法学.xlsx']
    for subject_file in subject_files:
        print('当前处理 %s' % subject_file)
        raw = pd.read_excel(os.path.join(subject_dir, subject_file), sheet_name=0)
        base = step_2_1(raw, ai_result_table)
        combined = step_2_2(base, ref_table=reference_table)
        print(combined)
        # third_id was only needed for the join; drop it before export.
        combined.drop(columns=['third_id'], inplace=True)
        combined.to_excel(os.path.join(ROOT_PATH, 'RESULT\%s' % subject_file), index=False)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Stage 1 (citation parsing) runs by default; stage 2 is enabled
    # manually once the stage-1 outputs exist on disk.
    main_step1()
    # main_step2()
|