# data_process/doi_parse/inspec2redis.py

# -*- coding: utf-8 -*-
# @Time    : 2022/5/31 10:02
# @Author  : ZhaoXiangPeng
# @File    : inspec2redis.py

from ReSpider.db.redisdb import RedisDB
from doi_parse.gz import un_gz
from queue import Queue
import pandas as pd
import os
import time
import json
from concurrent.futures import ThreadPoolExecutor


class Counter:
    """Thin wrapper around a Redis client used for per-journal hash counters.

    Each journal key is a Redis hash; its fields are article identifiers and
    its values are counts maintained through ``HINCRBY``.
    """

    def __init__(self, clint=None):
        """Store the Redis client, creating ``RedisDB(db=2)`` when none is given.

        The parameter keeps its original spelling (``clint``) so that existing
        keyword callers continue to work.
        """
        self.client: RedisDB = clint
        if clint is None:
            self.client = RedisDB(db=2)

    def incr(self, key: list, counter: list):
        """Increment, for every known journal key, the counter of each article.

        Args:
            key: Redis keys that can serve as unique journal ids. Each one is
                upper-cased and skipped unless it is a member of the
                ``inspec:journals`` set.
            counter: Values that can serve as unique article ids (a DOI or a
                title); each becomes a hash field that is incremented.

        A missing/empty ``key`` or ``counter`` is a no-op: records without an
        ISSN arrive here as ``None`` and must not abort the whole run.
        """
        if not key or not counter:
            return
        # Queue every increment and send them to Redis in a single round trip.
        pipe = self.client._redis.pipeline()
        pipe.multi()
        for k in key:
            k = k.upper()
            # The membership test goes through the regular client, not the
            # pipeline, because its answer is needed immediately.
            if not self.sismember('inspec:journals', k):
                continue
            for value in counter:
                pipe.hincrby(k, value)
        pipe.execute()

    def decr(self, key: str, counter: str, amount=1):
        """Decrement hash field ``counter`` of ``key`` by ``amount``."""
        return self.client.hincrby(key, counter, amount=-amount)

    def get_cnt(self, key: str, counter: str):
        """Return the current value stored in hash field ``counter`` of ``key``."""
        return self.client.hget(key, counter)

    def sismember(self, key, value):
        """Return the client's answer to whether ``value`` is in set ``key``."""
        return self.client.sismember(key, value)


class InspecToRedis:
    """Count referenced DOIs per journal from JSON dump files into Redis.

    Processing steps:
        1. Load a file
        2. Decompress it
        3. Parse it
        4. Store the result
        4.1 Associate records by ISSN
        4.2 Resulting layout::

            ISSN1
                doi1 1
                doi2 1
            ISSN2
                doi1 2
                doi2 1
    """

    def __init__(self, counter=None, file_list=None, root: str = None):
        """Set up the job.

        Args:
            counter: Object exposing ``incr(key, counter)`` (and ``client`` for
                :meth:`inspec2redis`); used to record the counts.
            file_list: File names, relative to ``root``, still to be processed.
                The list is used as-is (not copied) and is consumed by
                :meth:`get_next`. ``None`` means "no files".
            root: Directory prefix that is string-concatenated with each file
                name, so it must end with a path separator.
        """
        self.counter = counter
        self.file_list: list = file_list if file_list is not None else []
        print('初始化任务 %s 个' % len(self.file_list))
        self.root = root or 'H:/crossref_public_data_file_2021_01/'

    @staticmethod
    def load_gz(file_path):
        """Return the names of the entries found in directory ``file_path``."""
        return os.listdir(file_path)

    def inspec2redis(self, csv_path: str = 'F:/工作数据存储2022/20220526_inspec测试/inspec期刊列表2.csv'):
        """Load the ``ISSN`` column of a CSV into the ``inspec:journals`` set.

        Args:
            csv_path: CSV file containing an ``ISSN`` column. Defaults to the
                path that used to be hard-coded here.

        Returns:
            Whatever the client's ``sadd`` returns.
        """
        df = pd.read_csv(csv_path)
        issn_list = df['ISSN'].values.tolist()
        return self.counter.client.sadd('inspec:journals', issn_list)

    def get_next(self):
        """Remove and return the last pending file name."""
        return self.file_list.pop()

    def compatible(self, file):
        """Open ``file`` under ``root``, transparently handling ``gz`` files.

        Names ending in ``gz`` go through ``un_gz``; anything else is opened
        as UTF-8 text. The caller is responsible for closing the result.
        """
        path = self.root + file
        if file.endswith('gz'):
            return un_gz(path)
        return open(path, encoding='utf-8')

    def data_parse(self, io):
        """Parse one JSON document and count its reference DOIs per ISSN.

        Args:
            io: Readable file-like object holding a JSON object with an
                ``items`` list.

        Items are skipped when they report no references, carry no ISSN
        (nothing to attribute the counts to), or have no reference with a
        DOI. References identified only by a title are ignored.
        """
        data = json.load(io)
        for item in data['items']:
            if not item.get('reference-count', 0):
                continue
            issn = item.get('ISSN')
            if not issn:
                # Without an ISSN there is no journal key to count under.
                continue
            reference_list = [
                reference['DOI']
                for reference in item.get('reference') or []
                if reference.get('DOI')
            ]
            if reference_list:
                self.counter.incr(issn, reference_list)

    def pro(self, file=None):
        """Process a single file.

        Args:
            file: File name to process. When falsy, the next pending name is
                taken from ``file_list``; if none is left, nothing happens.
        """
        if not file and self.file_list:
            file = self.get_next()
        if not file:
            return
        print('******************** 当前处理文件 %s ********************' % file)

        io = self.compatible(file)
        try:
            self.data_parse(io)
        finally:
            # Release the handle even when parsing fails.
            io.close()

    def batch(self):
        """Process every file in ``file_list`` on a pool of two threads.

        Failures are reported per file instead of being silently dropped;
        one bad file does not prevent the others from being processed.
        """
        with ThreadPoolExecutor(max_workers=2) as executor:
            # Snapshot the list so submission is unaffected by later mutation.
            jobs = [(name, executor.submit(self.pro, name))
                    for name in list(self.file_list)]
            for name, job in jobs:
                error = job.exception()  # blocks until that job has finished
                if error is not None:
                    print('failed to process %s: %r' % (name, error))

    def start(self):
        """Process pending files one by one until ``file_list`` is empty."""
        index_count = 0
        while self.file_list:
            index_count += 1
            print('当前处理第 %s 个' % index_count)
            self.pro()


if __name__ == '__main__':
    # To run over a whole dump directory instead of a single sample file:
    # files = InspecToRedis.load_gz('H:/crossref_public_data_file_2021_01')
    # files = files[40000:]
    files = ['0.json']
    i2r = InspecToRedis(counter=Counter(), file_list=files)

    # i2r.inspec2redis() fills the 'inspec:journals' set that Counter.incr
    # checks; it is left disabled here.
    started = time.time()
    i2r.batch()
    elapsed = time.time() - started
    print('耗时 %s 秒' % elapsed)