|
|
# -*- coding: utf-8 -*-
|
|
|
# @Time : 2023/5/4 16:33
|
|
|
# @Author : zhaoxiangpeng
|
|
|
# @File : extract_score.py
|
|
|
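# Modified 2023-06-06 09:52:59: first aggregate the data (the grouping key is not named in this note), split it into multiple tables, then compute the scores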
|
|
|
|
|
|
import os
|
|
|
import pandas as pd
|
|
|
from typing import Union
|
|
|
import data_process_tool
|
|
|
from article_subject.utils import get_row_top_join_sub, get_row_top
|
|
|
|
|
|
# Row-scoring callable used by BaseExtractScore.process (applied with axis=1);
# currently bound to get_row_top_join_sub.
get_subject = get_row_top_join_sub
|
|
|
|
|
|
|
|
|
class GroupScore:
    """Group a score table by a key column (the scoring step is unfinished).

    Only loading, validation and grouping exist so far; nothing is computed
    from the groups yet.
    """

    def __init__(self, by_column, base_columns: list = None):
        """Remember the grouping key and the basic-information columns.

        Args:
            by_column: Column label (or list of labels) to group the table by.
            base_columns: Column labels stored on ``self.columns``; no method
                of this class reads them yet.
        """
        # The key used to be discarded, which left get_score() grouping on a
        # hard-coded empty-string label.
        self.by_column = by_column
        self.columns = base_columns

    def get_score(self, table_or_file: Union[pd.DataFrame, str]) -> pd.DataFrame:
        """Load the table if a path was given and group it by ``by_column``.

        Args:
            table_or_file: A DataFrame, or a string that is handed to
                ``data_process_tool.read_data`` to obtain one.

        Returns:
            Nothing yet: the per-group score computation is not implemented,
            so the method currently falls through without a return value.

        Raises:
            TypeError: If the (loaded) input is not a pandas DataFrame.
            KeyError: If ``by_column`` is not present in the table (raised by
                ``DataFrame.groupby``).
        """
        if isinstance(table_or_file, str):
            table_or_file = data_process_tool.read_data(table_or_file)

        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not isinstance(table_or_file, pd.DataFrame):
            raise TypeError(
                "table_or_file must be a pandas DataFrame or a file path, got "
                + type(table_or_file).__name__
            )

        # Group on the configured key rather than the literal '' label, which
        # raised KeyError for any table without a column named ''.
        groups = table_or_file.groupby(by=self.by_column)
        # TODO(review): compute the per-group scores from ``groups`` and
        # return them as a DataFrame; that logic does not exist yet.
|
|
|
|
|
|
|
|
|
class BaseExtractScore:
    """Derive a '高分学科' (high-score subject) column from a wide score table.

    Every column that is not listed in ``base_columns`` is passed, row by row,
    to ``get_subject``; the value it returns becomes the '高分学科' column and
    those input columns are dropped from the result.
    """

    def __init__(self, table: Union[pd.DataFrame, str], base_columns: list = None, score_columns: list = None):
        """Store the source table (or its path) and the base column list.

        Args:
            table: A DataFrame, or a string that is handed to
                ``data_process_tool.read_data`` when ``process`` runs.
            base_columns: Labels of the basic-information columns to keep.
            score_columns: Accepted for interface compatibility but not used
                by this class.
        """
        self._table = table
        self._columns = base_columns

    def process(self) -> pd.DataFrame:
        """Compute the '高分学科' column and return base columns plus that column.

        The source table is left untouched (earlier the caller's DataFrame was
        modified in place and ``self._table`` was overwritten with the result,
        so a second call lost the computed column).

        Returns:
            A new DataFrame with the base columns in ``base_columns`` order,
            followed by '高分学科', on a fresh default index.

        Raises:
            TypeError: If ``base_columns`` was left as ``None``.
            KeyError: If a base column is missing from the table (raised by
                ``DataFrame.set_index``).
        """
        if self._columns is None:
            # Same exception type as the former ``set(None)`` failure, but
            # with a message that names the actual problem.
            raise TypeError("base_columns must be a list of column names, got None")

        if isinstance(self._table, str):
            # Cache the loaded frame so the file is read only once.
            self._table = data_process_tool.read_data(self._table)

        base_columns = list(self._columns)
        known = set(base_columns)
        score_columns = [label for label in self._table.columns if label not in known]

        # Use the basic-information columns as the row index so that only the
        # remaining (score) columns are visible to the row function.
        scored = self._table.set_index(keys=base_columns)
        scored['高分学科'] = scored.apply(get_subject, axis=1)
        scored = scored.reset_index()

        return scored.drop(columns=score_columns)
|
|
|
|
|
|
|
|
|
def task1():
    """Run BaseExtractScore over the 知识视界 score CSV and save the result.

    Reads the input from a fixed directory, keeps the listed base columns plus
    the computed '高分学科' column, and writes the output CSV (without the
    index) into the same directory.
    """
    work_dir = 'F:/工作数据存储2023/20230426_评分模型/'
    source_csv = os.path.join(work_dir, '知识视界分类数据(已人工判断)_评分模型.csv')

    info_columns = [
        'Title', 'ABS', 'title_en', 'Title_ABS', 'cncole',
        '教育部学科门类', '教育部一级学科', '教育部二级学科',
        '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确',
        'Unnamed: 13',
    ]

    scored = BaseExtractScore(table=source_csv, base_columns=info_columns).process()

    target_csv = os.path.join(work_dir, '知识视界分类数据(已人工判断)_评分模型-高分学科.csv')
    scored.to_csv(target_csv, index=False)
|
|
|
|
|
|
|
|
|
def task2():
    """Run BaseExtractScore over the jove score CSV and save the result.

    Reads the input from a fixed directory, keeps the listed base columns plus
    the computed '高分学科' column, and writes the output CSV (without the
    index) into the same directory.
    """
    work_dir = 'F:/工作数据存储2023/20230426_评分模型/'
    source_csv = os.path.join(work_dir, 'jove-中图分类号数据_评分模型.csv')

    # Column labels are copied verbatim from the data file, including their
    # original spelling and quote characters.
    info_columns = [
        'Genre - 655 indicatior 1 and 2 " 4"',
        'Subjects - 650 indicator 1 and 2 " 4"',
        'Additional Material Characteristics 006 “m o c “',
        'Physical Description Fixed Field - 007 “cr unu”',
        'Video ID 001',
        'Video Name 245 $a indicator 1 and 2 "00"',
        '856 $3 inidcatior 1 and 2 "40"',
        'Link-856 $u',
        'Physical Decription-300 $a',
        'Format-',
        'Runnng Time-',
        'Subtitle/caption language codes (as ISO 3-letter codes separated by semicolons)',
        'Chapter Number-',
        'Formatted Contents: Chapter-505 $a',
        'Content Type 336 “$btdi $2rdacontent”',
        'Material Type 337 "$bc $2rdamedia”',
        'Carrier Type - 338 “$bcr$2rdacarrier"',
        'Summary - 520 $a',
        'Source of Description Note-588 $a',
        'Date-260 $c',
        'Publisher-260 $b',
        'City-260 $a',
        'Country Code-008',
        'Language-008',
        'Series-490',
        'ISSN-022 $a',
        'cnCode',
        '教育部学科门类',
        '教育部一级学科',
        '教育部二级学科',
        '未能识别的clc',
        '非常准确',
        '比较准确',
        '基本准确',
        '比较不准确',
        '非常不准确',
    ]

    scored = BaseExtractScore(table=source_csv, base_columns=info_columns).process()

    target_csv = os.path.join(work_dir, 'jove-中图分类号数据_评分模型-高分学科.csv')
    scored.to_csv(target_csv, index=False)
|
|
|
|
|
|
|
|
|
def task3():
    """Run BaseExtractScore over the 0028-4793 score CSV and save the result.

    Reads the input from a fixed directory, keeps the listed base columns plus
    the computed '高分学科' column, and writes the output CSV (without the
    index) into the same directory.
    """
    work_dir = 'Z:/数据处理流程/'
    source_csv = os.path.join(work_dir, '0028-4793_评分模型.csv')

    info_columns = [
        'PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO',
        'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE',
        'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'OI', 'FU', 'FP',
        'FX', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA',
        'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN',
        'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA',
        'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA',
        'UT', 'cncode', '一级学科', '二级学科', '一级学科数目', '二级学科数目',
    ]

    scored = BaseExtractScore(table=source_csv, base_columns=info_columns).process()

    target_csv = os.path.join(work_dir, '0028-4793-高分学科.csv')
    scored.to_csv(target_csv, index=False)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Only the third job is active; call task1() or task2() here instead to
    # re-run the other data sets.
    task3()
|