You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
49 lines
1.9 KiB
Python
49 lines
1.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
# @Time : 2024/3/19 15:18
|
|
# @Author : zhaoxiangpeng
|
|
# @File : get_lose_year.py
|
|
|
|
import os
|
|
import re
|
|
|
|
from loguru import logger
|
|
|
|
# Root folder that holds one sub-folder per ISSN.
# Raw string: the backslashes are literal path separators, not escapes
# ("\w" and "\i" are invalid escape sequences in a normal string literal).
ROOT_PATH = r"Y:\wos-metadata\issn-data"

# Matches one or two "=" followed by a parenthesised 4-digit number,
# e.g. "IS=(2020)" or "IS==(2020)"; group 1 is the year.
YEAR_DIR_PATTERN = re.compile(r'={1,2}\((\d{4})\)')

# ISSN folder name -> sorted list of years missing between its first and last year.
collection = dict()
# ISSN folder names that contain at least one year-named sub-folder.
has_year_collection = set()


def extract_year(dir_name):
    """Return the year embedded in *dir_name*, or ``None`` if there is none.

    Only the first ``=(YYYY)`` / ``==(YYYY)`` occurrence is considered.
    """
    match = YEAR_DIR_PATTERN.search(dir_name)
    if match is None:
        return None
    return int(match.group(1))


def find_missing_years(years):
    """Return the sorted years absent from the span ``min(years)..max(years)``.

    Duplicate entries in *years* are ignored, so a year that appears twice
    cannot mask a gap elsewhere in the range. An empty input yields ``[]``.
    """
    present = set(years)
    if not present:
        return []
    return [
        year
        for year in range(min(present), max(present) + 1)
        if year not in present
    ]


def collect_years(issn_dir):
    """Return the years found in the names of the sub-folders of *issn_dir*.

    Plain files and sub-folders whose name carries no year are skipped.
    """
    years = []
    for child_name in os.listdir(issn_dir):
        # Only folders are inspected; plain files are ignored.
        if not os.path.isdir(os.path.join(issn_dir, child_name)):
            continue
        year = extract_year(child_name)
        if year is not None:
            years.append(year)
    return years


def main(root_path=ROOT_PATH):
    """Scan every ISSN folder under *root_path* and report missing years.

    Fills the module-level ``has_year_collection`` and ``collection`` and
    logs both once the scan is finished.
    """
    # Every entry directly under the root is expected to be an ISSN folder.
    for master_dir in os.listdir(root_path):
        logger.debug('检测路径: %s' % master_dir)
        issn_file_path = os.path.join(root_path, master_dir)
        # A stray file under the root would make os.listdir() raise; skip it.
        if not os.path.isdir(issn_file_path):
            continue

        single_year_list = collect_years(issn_file_path)
        if not single_year_list:
            continue
        has_year_collection.add(master_dir)

        # Compare the years present with the full first..last range.
        lose_year_list = find_missing_years(single_year_list)
        if lose_year_list:
            logger.warning("%s 有年份缺失, 缺失的年份有: %s" % (master_dir, lose_year_list))
            collection[master_dir] = lose_year_list

    logger.warning('有年份的文件夹: %s' % has_year_collection)
    logger.warning('缺失年份的文件夹: %s' % collection)


if __name__ == "__main__":
    main()