|
|
@ -29,7 +29,7 @@ INFO_SPLIT_SYMBOL_LENGTH = len(INFO_SPLIT_SYMBOL)
|
|
|
|
# 引文表需要有一个UT字段用来作为主键
|
|
|
|
# 引文表需要有一个UT字段用来作为主键
|
|
|
|
REF_RECORD_TABLE_FIELD = ['UT', 'doi']
|
|
|
|
REF_RECORD_TABLE_FIELD = ['UT', 'doi']
|
|
|
|
# 完整记录要保留的字段, {'exported.%s' % key.lower() : 1 for key in FULL_RECORD_TABLE_FIELD}
|
|
|
|
# 完整记录要保留的字段, {'exported.%s' % key.lower() : 1 for key in FULL_RECORD_TABLE_FIELD}
|
|
|
|
FULL_RECORD_TABLE_FIELD = ['DI', 'SO', 'PT', 'UT', 'AB', 'SN', 'EI', 'BN', 'PY']
|
|
|
|
FULL_RECORD_TABLE_FIELD = ['DI', 'SO', 'DT', 'UT', 'AB', 'SN', 'EI', 'BN', 'PY']
|
|
|
|
TABLE_HEAD_TRANS = {k.lower(): k for k in FULL_RECORD_TABLE_FIELD}
|
|
|
|
TABLE_HEAD_TRANS = {k.lower(): k for k in FULL_RECORD_TABLE_FIELD}
|
|
|
|
|
|
|
|
|
|
|
|
MONGODB_REMOTE_CONFIG = dict(
|
|
|
|
MONGODB_REMOTE_CONFIG = dict(
|
|
|
@ -50,7 +50,7 @@ def find_doi_data_from_mongo(doi_list: list):
|
|
|
|
find_results = collection.find(
|
|
|
|
find_results = collection.find(
|
|
|
|
filter={"exported.di": {"$in": doi_list}},
|
|
|
|
filter={"exported.di": {"$in": doi_list}},
|
|
|
|
projection={"_id": 0, "third_id": 1, 'exported.ab': 1, 'exported.bn': 1, 'exported.di': 1, 'exported.ei': 1,
|
|
|
|
projection={"_id": 0, "third_id": 1, 'exported.ab': 1, 'exported.bn': 1, 'exported.di': 1, 'exported.ei': 1,
|
|
|
|
'exported.is': 1, 'exported.pt': 1, 'exported.sn': 1, 'exported.so': 1}
|
|
|
|
'exported.is': 1, 'exported.dt': 1, 'exported.sn': 1, 'exported.so': 1}
|
|
|
|
).collation({"locale": "en", "strength": 2}) # 忽略大小写
|
|
|
|
).collation({"locale": "en", "strength": 2}) # 忽略大小写
|
|
|
|
for document in find_results:
|
|
|
|
for document in find_results:
|
|
|
|
exported: dict = document.get('exported')
|
|
|
|
exported: dict = document.get('exported')
|
|
|
@ -96,7 +96,8 @@ def ref_str2dic(text):
|
|
|
|
if not re.match(r'\d{4}', py):
|
|
|
|
if not re.match(r'\d{4}', py):
|
|
|
|
py = None
|
|
|
|
py = None
|
|
|
|
doi_idx = ref.find(DOI_SPLIT_SYMBOL)
|
|
|
|
doi_idx = ref.find(DOI_SPLIT_SYMBOL)
|
|
|
|
model = dict(au=au, py=py, so=so)
|
|
|
|
# 把参考文献字段也加进去
|
|
|
|
|
|
|
|
model = dict(au=au, py=py, so=so, ref=ref)
|
|
|
|
if doi_idx != -1:
|
|
|
|
if doi_idx != -1:
|
|
|
|
doi_text = ref[doi_idx + DOI_SPLIT_SYMBOL_LENGTH:]
|
|
|
|
doi_text = ref[doi_idx + DOI_SPLIT_SYMBOL_LENGTH:]
|
|
|
|
if doi_text.startswith('['):
|
|
|
|
if doi_text.startswith('['):
|
|
|
@ -191,7 +192,7 @@ def step_1_3():
|
|
|
|
ref_table['hot'] = None
|
|
|
|
ref_table['hot'] = None
|
|
|
|
# 保留需要使用的列
|
|
|
|
# 保留需要使用的列
|
|
|
|
ref_table = ref_table[
|
|
|
|
ref_table = ref_table[
|
|
|
|
['third_id', '作者', '年份', '刊名-简称', 'doi', 'citedTitle', 'citedWork', 'hot', 'PT', 'UT', 'AB', 'SN', 'online issn',
|
|
|
|
['third_id', '作者', '年份', '刊名-简称', 'doi', 'citedTitle', 'citedWork', 'hot', 'DT', 'UT', 'AB', 'SN', 'online issn',
|
|
|
|
'isbn']]
|
|
|
|
'isbn']]
|
|
|
|
|
|
|
|
|
|
|
|
ref_table.to_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'), sep='\t',
|
|
|
|
ref_table.to_csv(os.path.join(ROOT_PATH, '标准doi扩展完整字段.csv'), sep='\t',
|
|
|
@ -226,9 +227,9 @@ def step_2_1(table: pd.DataFrame, ai_result_table):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main_step1():
|
|
|
|
def main_step1():
|
|
|
|
# step_1_1()
|
|
|
|
step_1_1()
|
|
|
|
# step_1_2()
|
|
|
|
# step_1_2()
|
|
|
|
step_1_3() # WOS:000426769900009
|
|
|
|
# step_1_3() # WOS:000426769900009
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main_step2():
|
|
|
|
def main_step2():
|
|
|
@ -249,5 +250,5 @@ def main_step2():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if __name__ == '__main__':
|
|
|
|
# main_step1()
|
|
|
|
main_step1()
|
|
|
|
main_step2()
|
|
|
|
# main_step2()
|
|
|
|