You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

49 lines
1.9 KiB
Python

# -*- coding: utf-8 -*-
# @Time : 2024/3/19 15:18
# @Author : zhaoxiangpeng
# @File : get_lose_year.py
import os
import re
from loguru import logger
# Root directory expected to contain one sub-directory per ISSN.
# Raw string: the previous non-raw literal only worked because "\w" and "\i"
# are not recognised escapes, which newer Python versions warn about.
ROOT_PATH = r"Y:\wos-metadata\issn-data"

# ISSN directory name -> sorted list of years missing from its year range.
collection = dict()
# Names of ISSN directories that contain at least one year sub-directory.
has_year_collection = set()

# Matches a four-digit year in parentheses preceded by one or two "=" signs,
# e.g. "IS=(2020)" or "IS==(2020)". Compiled once instead of on every lookup.
YEAR_PATTERN = re.compile(r'={1,2}\((\d{4})\)')


def extract_year(dir_name):
    """Return the first year encoded in *dir_name* as an int, or None.

    A year is recognised only in the form matched by ``YEAR_PATTERN``.
    """
    match = YEAR_PATTERN.search(dir_name)
    if match is None:
        return None
    return int(match.group(1))


def collect_years(issn_path):
    """Return the set of years found among the sub-directories of *issn_path*.

    Regular files are ignored, as are directories whose name carries no
    recognisable year. A set is returned so that two directories naming the
    same year cannot be counted twice.

    Raises:
        OSError: if *issn_path* cannot be listed.
    """
    years = set()
    for child_name in os.listdir(issn_path):
        # Only directories carry year information; skip plain files.
        if not os.path.isdir(os.path.join(issn_path, child_name)):
            continue
        year = extract_year(child_name)
        if year is not None:
            years.add(year)
    return years


def missing_years(years):
    """Return the sorted years absent between min(years) and max(years).

    Duplicates in *years* are ignored. An empty input yields an empty list.
    """
    present = set(years)
    if not present:
        return []
    return [
        year
        for year in range(min(present), max(present) + 1)
        if year not in present
    ]


def scan_missing_years(root_path=ROOT_PATH):
    """Scan every ISSN directory under *root_path* and report year gaps.

    Fills the module-level ``has_year_collection`` and ``collection`` as it
    goes, logs a warning for each ISSN directory with missing years, and
    finally logs both collections.

    Raises:
        OSError: if *root_path* (or an ISSN directory) cannot be listed.
    """
    # Every entry directly under the root is treated as an ISSN directory.
    for master_dir in os.listdir(root_path):
        logger.debug('检测路径: %s' % master_dir)
        issn_file_path = os.path.join(root_path, master_dir)
        # A stray file at the top level would make os.listdir() raise.
        if not os.path.isdir(issn_file_path):
            continue

        years = collect_years(issn_file_path)
        if not years:
            continue
        has_year_collection.add(master_dir)

        # Compare the years present against the full min..max range.
        lose_year_list = missing_years(years)
        if lose_year_list:
            logger.warning("%s 有年份缺失, 缺失的年份有: %s" % (master_dir, lose_year_list))
            collection[master_dir] = lose_year_list

    logger.warning('有年份的文件夹: %s' % has_year_collection)
    logger.warning('缺失年份的文件夹: %s' % collection)


if __name__ == "__main__":
    scan_missing_years()