You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

83 lines
4.3 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding: utf-8 -*-
# @Time : 2023/5/4 16:33
# @Author : zhaoxiangpeng
# @File : extract_score.py
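# Modified 2023-06-06 09:52:59: rows must first be aggregated (the grouping key is missing from the original note), split into several tables, and then scored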
import os
import pandas as pd
from typing import Union
import data_process_tool
from article_subject.utils import get_row_top_join_sub, get_row_top
# Row-level callable that BaseExtractScore.process applies with axis=1 to
# fill the '高分学科' column. get_row_top is imported next to it; presumably it
# is the alternative strategy this alias can be pointed at - TODO confirm.
get_subject = get_row_top_join_sub
class GroupScore:
    """Group a table by a key column as the first step of per-group scoring.

    Args:
        by_column: Column label (or list of labels) used as the groupby key.
        base_columns: Stored on ``self.columns``; no code in this class reads
            it yet.
    """

    def __init__(self, by_column, base_columns: list = None):
        # The grouping key used to be dropped here, which left get_score with
        # nothing but a placeholder '' key to group on.
        self.by_column = by_column
        self.columns = base_columns

    def get_score(self, table_or_file: Union[pd.DataFrame, str]) -> pd.DataFrame:
        """Load the table if a path was given and group it by ``by_column``.

        Args:
            table_or_file: A DataFrame, or a path that
                ``data_process_tool.read_data`` can load.

        Raises:
            TypeError: If the (loaded) input is not a DataFrame.
            KeyError: Raised by pandas when ``by_column`` is not in the table.

        NOTE(review): the per-group score computation is not implemented
        anywhere in this file, so the method still falls through and returns
        ``None`` despite the annotation. Finish it before relying on the
        return value.
        """
        if isinstance(table_or_file, str):
            table_or_file = data_process_tool.read_data(table_or_file)
        # Explicit raise instead of assert: asserts vanish under ``python -O``.
        if not isinstance(table_or_file, pd.DataFrame):
            raise TypeError(
                'table_or_file must be a pandas DataFrame or a file path, got %s'
                % type(table_or_file).__name__
            )
        groups = table_or_file.groupby(by=self.by_column)
class BaseExtractScore:
    """Reduce a table to its base columns plus a computed '高分学科' column.

    Args:
        table: A DataFrame, or a path that ``data_process_tool.read_data``
            can load (read on the first ``process`` call).
        base_columns: Columns carried through to the output. They serve as
            the row index while the remaining columns are scored.
        score_columns: Columns handed to ``get_subject``. ``None`` (the
            default) means every column not listed in ``base_columns``.
    """

    def __init__(self, table: Union[pd.DataFrame, str], base_columns: list = None, score_columns: list = None):
        self._table = table
        self._columns = base_columns
        # Previously accepted and silently ignored.
        self._score_columns = score_columns

    def process(self) -> pd.DataFrame:
        """Return a new frame: the base columns followed by '高分学科'.

        The source table is left untouched, so a DataFrame passed in by the
        caller is not modified and repeated calls give the same result.

        Raises:
            ValueError: If no base columns were configured.
        """
        if not self._columns:
            raise ValueError('base_columns must name at least one column')
        if isinstance(self._table, str):
            self._table = data_process_tool.read_data(self._table)

        # Use the base-information columns as the row index so that only the
        # score columns remain as data. The non-inplace set_index returns a
        # new frame instead of rewriting self._table.
        indexed = self._table.set_index(keys=list(self._columns))
        if self._score_columns is not None:
            indexed = indexed[list(self._score_columns)]

        top_subject = indexed.apply(get_subject, axis=1)

        # Drop the raw score columns first, then attach the result, so a
        # score column that happens to be called '高分学科' cannot wipe it out.
        result = indexed.drop(labels=list(indexed.columns), axis=1)
        result['高分学科'] = top_subject
        return result.reset_index()
def task1():
    """Score the manually judged classification CSV and save the result.

    Reads the input CSV from the fixed working directory, runs
    BaseExtractScore over it and writes the processed table next to it,
    without the index column.
    """
    work_dir = 'F:/工作数据存储2023/20230426_评分模型/'
    info_columns = ['Title', 'ABS', 'title_en', 'Title_ABS', 'cncole', '教育部学科门类', '教育部一级学科', '教育部二级学科', '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确', 'Unnamed: 13']
    source_csv = os.path.join(work_dir, '知识视界分类数据已人工判断_评分模型.csv')
    target_csv = os.path.join(work_dir, '知识视界分类数据已人工判断_评分模型-高分学科.csv')

    extractor = BaseExtractScore(table=source_csv, base_columns=info_columns)
    scored = extractor.process()
    scored.to_csv(target_csv, index=False)
def task2():
    """Score the jove classification CSV and save the result.

    Reads the input CSV from the fixed working directory, runs
    BaseExtractScore over it and writes the processed table next to it,
    without the index column.
    """
    work_dir = 'F:/工作数据存储2023/20230426_评分模型/'
    # Header names are reproduced exactly as they appear in the source file,
    # including their typos and mixed quote characters.
    info_columns = ['Genre - 655 indicatior 1 and 2 " 4"', 'Subjects - 650 indicator 1 and 2 " 4"', 'Additional Material Characteristics 006 “m o c “', 'Physical Description Fixed Field - 007 “cr unu”', 'Video ID 001', 'Video Name 245 $a indicator 1 and 2 "00"', '856 $3 inidcatior 1 and 2 "40"', 'Link-856 $u', 'Physical Decription-300 $a', 'Format-', 'Runnng Time-', 'Subtitle/caption language codes (as ISO 3-letter codes separated by semicolons)', 'Chapter Number-', 'Formatted Contents: Chapter-505 $a', 'Content Type 336 “$btdi $2rdacontent”', 'Material Type 337 "$bc $2rdamedia”', 'Carrier Type - 338 “$bcr$2rdacarrier"', 'Summary - 520 $a', 'Source of Description Note-588 $a', 'Date-260 $c', 'Publisher-260 $b', 'City-260 $a', 'Country Code-008', 'Language-008', 'Series-490', 'ISSN-022 $a', 'cnCode', '教育部学科门类', '教育部一级学科', '教育部二级学科', '未能识别的clc', '非常准确', '比较准确', '基本准确', '比较不准确', '非常不准确']
    source_csv = os.path.join(work_dir, 'jove-中图分类号数据_评分模型.csv')
    target_csv = os.path.join(work_dir, 'jove-中图分类号数据_评分模型-高分学科.csv')

    extractor = BaseExtractScore(table=source_csv, base_columns=info_columns)
    scored = extractor.process()
    scored.to_csv(target_csv, index=False)
def task3():
    """Score the 0028-4793 CSV and save the result.

    Reads the input CSV from the fixed working directory, runs
    BaseExtractScore over it and writes the processed table next to it,
    without the index column.
    """
    work_dir = 'Z:/数据处理流程/'
    info_columns = ['PT', 'AU', 'BA', 'BE', 'GP', 'AF', 'BF', 'CA', 'TI', 'SO', 'SE', 'BS', 'LA', 'DT', 'CT', 'CY', 'CL', 'SP', 'HO', 'DE', 'ID', 'AB', 'C1', 'C3', 'RP', 'EM', 'RI', 'OI', 'FU', 'FP', 'FX', 'CR', 'NR', 'TC', 'Z9', 'U1', 'U2', 'PU', 'PI', 'PA', 'SN', 'EI', 'BN', 'J9', 'JI', 'PD', 'PY', 'VL', 'IS', 'PN', 'SU', 'SI', 'MA', 'BP', 'EP', 'AR', 'DI', 'DL', 'D2', 'EA', 'PG', 'WC', 'WE', 'SC', 'GA', 'PM', 'OA', 'HC', 'HP', 'DA', 'UT', 'cncode', '一级学科', '二级学科', '一级学科数目', '二级学科数目']
    source_csv = os.path.join(work_dir, '0028-4793_评分模型.csv')
    target_csv = os.path.join(work_dir, '0028-4793-高分学科.csv')

    extractor = BaseExtractScore(table=source_csv, base_columns=info_columns)
    scored = extractor.process()
    scored.to_csv(target_csv, index=False)
if __name__ == '__main__':
    # Only one task runs per invocation; the other entry points are left
    # commented out so they can be switched back on by hand.
    # task1()
    # task2()
    task3()