You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

255 lines
9.4 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2025/1/8 14:28
# @Author : zhaoxiangpeng
# @File : 20250108_func.py
# 1.引文表构建
# 1.1.解析发文表的引文字段
# 1.2.下载有doi的引文的完整记录
# 1.3.对有doi的引文进行完整记录匹配
# 1.4.没有doi的发文要保留解析的其他信息 CR字段扩展.csv
# 1.5.需要保留发文的唯一id用来匹配 标准doi扩展完整字段.csv
# 2.发文表构建
# 2.1.对不同的学科与学校发文进行匹配,获取到发文的学校、学者
# 2.2.发文信息的结果与引文信息的结果进行匹配
import os
import re
import pandas as pd
from pymongo import MongoClient
# --- Filesystem layout -------------------------------------------------------
# Raw strings avoid the invalid "\数" escape sequence of the original literal
# (a DeprecationWarning since 3.6, SyntaxWarning on 3.12+); the values are
# byte-identical to before.
READ_PATH = r'Y:\wos-metadata\SCHOOL\bnu'
ROOT_PATH = r'Y:\数据采集需求\陶思琪20211203'

# Markers used when parsing the WOS "CR" (cited references) field.
DOI_SPLIT_SYMBOL = 'DOI '
DOI_SPLIT_SYMBOL_LENGTH = len(DOI_SPLIT_SYMBOL)
INFO_SPLIT_SYMBOL = ', '
INFO_SPLIT_SYMBOL_LENGTH = len(INFO_SPLIT_SYMBOL)

# The citation table needs a UT column to act as its primary key.
REF_RECORD_TABLE_FIELD = ['UT', 'doi']
# Columns kept from the full WOS record, e.g.
# {'exported.%s' % key.lower(): 1 for key in FULL_RECORD_TABLE_FIELD}
FULL_RECORD_TABLE_FIELD = ['DI', 'SO', 'DT', 'UT', 'AB', 'SN', 'EI', 'BN', 'PY']
# Maps the lower-case Mongo field names back to their upper-case WOS labels.
TABLE_HEAD_TRANS = {k.lower(): k for k in FULL_RECORD_TABLE_FIELD}

# SECURITY: database credentials are hard-coded here (and duplicated inside
# `url`). They should be moved to environment variables or a config file kept
# out of version control.
MONGODB_REMOTE_CONFIG = dict(
    host='101.43.239.105',
    port=27017,
    db='science2',
    username='science-dev',
    password='kcidea1509!%25)(',
    url='mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true'
)
client: MongoClient = MongoClient(MONGODB_REMOTE_CONFIG.get("url"))
database = client.get_database(MONGODB_REMOTE_CONFIG.get('db'))
def find_doi_data_from_mongo(doi_list: list):
    """Yield the exported WOS fields for each record whose DOI is in *doi_list*.

    The DOI match is case-insensitive (collation strength 2).  Each yielded
    dict is the record's ``exported`` sub-document, with the Mongo
    ``third_id`` injected under ``'ut'`` when no ``'ut'`` key is present.
    """
    wanted = ('ab', 'bn', 'di', 'ei', 'is', 'dt', 'sn', 'so')
    projection = {"_id": 0, "third_id": 1}
    projection.update({'exported.%s' % field: 1 for field in wanted})
    cursor = database.get_collection('data_wos_article').find(
        filter={"exported.di": {"$in": doi_list}},
        projection=projection,
    ).collation({"locale": "en", "strength": 2})  # ignore case
    for doc in cursor:
        record: dict = doc.get('exported')
        record.setdefault('ut', doc.get('third_id'))
        yield record
def step_2_2(base_table: pd.DataFrame, ref_table: pd.DataFrame):
    """Step 2.2: join publication results with citation results.

    Left-joins *ref_table* onto *base_table*, matching the publication's
    ``wos`` id against the citation's ``third_id`` column.
    """
    merged = base_table.merge(ref_table, how='left',
                              left_on=['wos'], right_on=['third_id'])
    return merged
def verify_doi(text) -> str:
    """Normalize a DOI token: drop a leading 'DOI ' marker and a trailing ')'."""
    marker_at = text.find(DOI_SPLIT_SYMBOL)
    doi = text if marker_at == -1 else text[marker_at + DOI_SPLIT_SYMBOL_LENGTH:]
    if doi.endswith(')'):
        doi = doi[:-1]
    return doi
def ref_str2dic(text):
    """Parse a WOS ``CR`` (cited references) field into per-reference dicts.

    *text* is a '; '-separated list of references.  For each reference this
    yields one dict per DOI found (or a single ``doi=None`` dict when the
    reference carries no DOI) with keys ``doi``, ``au`` (author), ``py``
    (year), ``so`` (source title) and ``ref`` (the raw reference string).
    """
    for ref in text.split('; '):
        # The first three ', '-separated tokens are author, year and source.
        remainder = ref
        fields = []
        for _ in range(3):
            cut = remainder.find(INFO_SPLIT_SYMBOL)
            fields.append(remainder[:cut])
            remainder = remainder[cut + INFO_SPLIT_SYMBOL_LENGTH:]
        au, py, so = fields
        # The year must start with four digits, otherwise it is discarded.
        if not re.match(r'\d{4}', py):
            py = None
        # Keep the raw reference text alongside the parsed fields.
        base = dict(au=au, py=py, so=so, ref=ref)
        marker_at = ref.find(DOI_SPLIT_SYMBOL)
        if marker_at == -1:
            yield dict(doi=None, **base)
            continue
        doi_part = ref[marker_at + DOI_SPLIT_SYMBOL_LENGTH:]
        if doi_part.startswith('['):
            # '[doi1, doi2, ...]' form: emit one record per listed DOI.
            for candidate in doi_part[1:-1].split(', '):
                if doi := verify_doi(candidate):
                    yield dict(doi=doi, **base)
        elif doi := verify_doi(doi_part):
            yield dict(doi=doi, **base)
def step_1_1():
    """Step 1.1: parse the citation (CR) field of every school's publication table.

    Reads every tab-separated file under ``READ_PATH/<school>/``, explodes the
    ``CR`` field into one row per parsed reference (via :func:`ref_str2dic`),
    tags each row with the publication's ``UT`` id, and writes the result to
    ``ROOT_PATH/doi展开/<school>.csv``.
    """
    for school in os.listdir(READ_PATH):
        # These two directories hold pipeline outputs, not school data.
        if school in ('doi展开', '待下载DOI'):
            continue
        school_path = os.path.join(READ_PATH, school)
        rows = []
        for file in os.listdir(school_path):
            f = os.path.join(school_path, file)
            try:
                # pandas >= 1.3 replaced error_bad_lines with on_bad_lines
                # (the old keyword was removed in pandas 2.0).
                table = pd.read_csv(f, sep='\t', on_bad_lines='skip')
            except TypeError:
                # Older pandas: fall back to the legacy keyword.
                table = pd.read_csv(f, sep='\t', error_bad_lines=False)
            for third_id, cr_text in table[['UT', 'CR']].values.tolist():
                if pd.isna(cr_text):
                    continue
                for parsed in ref_str2dic(cr_text):
                    parsed.setdefault('third_id', third_id)
                    rows.append(parsed)
        # The original duplicated the ROOT_PATH literal here (with an invalid
        # "\数" escape); ROOT_PATH points to the same location.
        pd.DataFrame(rows).to_csv(
            os.path.join(ROOT_PATH, 'doi展开', '%s.csv' % school),
            index=False)
def step_1_2():
    """Step 1.2: download the full records for references that have a DOI.

    Reads the pending-DOI list, queries Mongo in batches of 500 DOIs, and
    writes the matched records to ``ROOT_PATH/CR字段扩展.csv`` (tab-separated).
    """
    batch_size = 500
    pending = pd.read_csv(os.path.join(ROOT_PATH, '待下载DOI.csv'))['DO'].values.tolist()
    records = []
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        records.extend(find_doi_data_from_mongo(doi_list=batch))
    pd.DataFrame(records).to_csv(os.path.join(ROOT_PATH, 'CR字段扩展.csv'),
                                 sep='\t', index=False)
def step_1_3():
    """Steps 1.3/1.4: attach the downloaded full record to each reference DOI.

    References without a DOI keep only their parsed fields.  The result is
    written to ``ROOT_PATH/标准doi扩展完整字段.csv`` (tab-separated).
    """
    expanded = pd.read_csv(os.path.join(ROOT_PATH, 'doi展开.csv'))
    full_record = pd.read_csv(os.path.join(ROOT_PATH, 'CR字段扩展.csv'), sep='\t')
    full_record.rename(columns=TABLE_HEAD_TRANS, inplace=True)
    # Lower-case both sides before matching so case differences don't lose rows.
    expanded['doi_backup'] = expanded['doi'].str.lower()
    full_record['DI'] = full_record['DI'].str.lower()
    # Left-join the full record onto each reference via the normalized DOI.
    # (Debug anchor: expanded[expanded['third_id'] == 'WOS:000426769900009'])
    merged = expanded.merge(full_record, how='left',
                            left_on=['doi_backup'], right_on=['DI'])
    # Rename to the agreed output headers.
    merged.rename(
        columns={'au': '作者', 'py': '年份', 'so': '刊名-简称', 'SO': 'citedTitle',
                 'EI': 'online issn', 'BN': 'isbn'},
        inplace=True
    )
    merged['citedWork'] = None
    merged['hot'] = None
    # Keep only the columns the downstream step expects.
    wanted = ['third_id', '作者', '年份', '刊名-简称', 'doi', 'citedTitle',
              'citedWork', 'hot', 'DT', 'UT', 'AB', 'SN', 'online issn', 'isbn']
    merged[wanted].to_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'),
                          sep='\t', index=False)
def step_2_1(table: pd.DataFrame, ai_result_table):
    """Step 2.1: attach recognized school/author info to the publication table.

    Parameters
    ----------
    table : pd.DataFrame
        Raw subject table; must contain UT, TC, PY, IS, VL and CR columns.
    ai_result_table : pd.DataFrame
        Recognition results keyed by '文献编号', providing '本校的作者' and
        '所属学校'.

    Returns
    -------
    pd.DataFrame
        The renamed/selected columns consumed by step 2.2.
    """
    table = table[['UT', 'TC', 'PY', 'IS', 'VL', 'CR']]  # field list still in question
    # Left-join the author/school recognition results on the WOS id.
    au_table = table.merge(right=ai_result_table, how='left',
                           left_on=['UT'], right_on=['文献编号'])
    # Placeholder columns filled later in the pipeline.
    # (The original assigned 'uid' twice; once is enough.)
    au_table['uid'] = None
    au_table['docid'] = None
    au_table['articleId'] = None
    # Rename to the output schema.
    au_table.rename(
        columns={'UT': 'wos', '本校的作者': 'citedAuthor', 'TC': 'timesCited',
                 'PY': 'year', 'IS': 'page', 'VL': 'volume', 'CR': '参考文献'},
        inplace=True
    )
    # Keep only the columns the downstream join expects.
    au_table = au_table[['wos', 'uid', 'docid', 'articleId', '所属学校', 'citedAuthor',
                         'timesCited', 'year', 'page', 'volume', '参考文献']]
    return au_table
def main_step1():
    """Phase 1 driver: build the citation table.

    Steps 1.2/1.3 are currently disabled and are run manually as needed.
    """
    step_1_1()
    # step_1_2()
    # step_1_3()  # debug anchor: WOS:000426769900009
def main_step2():
    """Phase 2 driver: merge publication info with the citation table.

    For each subject workbook, attaches school/author info (step 2.1), joins
    the citation table (step 2.2), and writes the result to
    ``ROOT_PATH/RESULT/<subject>.xlsx``.
    """
    ai_result_table = pd.read_csv(os.path.join(ROOT_PATH, '学校地址信息1.8.csv'))
    reference_table = pd.read_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'), sep='\t')
    # Gather the subject tables (currently limited to a single subject).
    subject_table_path = os.path.join(ROOT_PATH, '原始数据合并')
    # subject_table_list = os.listdir(subject_table_path)
    subject_table_list = ['法学.xlsx']
    for subject_table in subject_table_list:
        print('当前处理 %s' % subject_table)
        table = pd.read_excel(os.path.join(subject_table_path, subject_table), sheet_name=0)
        table = step_2_1(table, ai_result_table)
        full_table = step_2_2(table, ref_table=reference_table)
        print(full_table)
        full_table.drop(columns=['third_id'], inplace=True)
        # os.path.join replaces the former 'RESULT\%s' literal, which relied on
        # the invalid "\%" escape sequence.
        full_table.to_excel(os.path.join(ROOT_PATH, 'RESULT', subject_table), index=False)
# Script entry point: phase 1 (citation-table build) runs by default;
# enable main_step2() once the phase-1 outputs exist.
if __name__ == '__main__':
    main_step1()
    # main_step2()