You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

41 lines
1.2 KiB
Python

# -*- coding: utf-8 -*-
# @Time : 2022/6/6 10:26
# @Author : ZhaoXiangPeng
# @File : 合并统计.py
import pandas as pd
from 数据处理.文件合并 import join, load_file
# Directory containing the per-file CSVs to combine. The trailing slash
# matters: file names are appended to it by plain string concatenation
# in the loop further down (`path+f`).
path = 'E:/inspec合并/'
# load_file is a project helper; presumably it returns the file names found
# under `path` (not full paths) -- TODO confirm against 数据处理.文件合并.
files = load_file(path)
# Disabled alternative that would prefix every entry with `path` up front:
# files = [f'{path}{f}' for f in files]
# Echo the discovered files so the run can be checked by eye.
print(files)
def count(file_name: str):
    """Read the CSV at ``file_name`` and return it as a DataFrame.

    The rows are returned exactly as ``pd.read_csv`` parses them; no
    grouping or summing is performed here.
    """
    return pd.read_csv(file_name)
# Seed list: the distinct DOIs (column 'DI') of the search-result export.
# Only these DOIs are looked up in the per-file tables below.
df0 = pd.read_csv(r'F:\工作数据存储2022\20220526_inspec测试\Digital twin searching result 2.csv')
df0 = df0[['DI']].drop_duplicates()

# Gather the matching rows of every file first and concatenate once at the
# end; concatenating inside the loop re-copies the accumulated frame on each
# iteration.
matched_frames = []
for f in files:
    t_df = count(path + f)
    # Left-join the seed DOIs against this file's rows, then keep only the
    # seed DOIs that actually have a count in this file.
    ts = pd.merge(df0, t_df, how='left', left_on=['DI'], right_on=['doi'])
    ts = ts[ts['count'].notnull()]
    if not ts.empty:
        matched_frames.append(ts)

pp = []
if matched_frames:
    big_df = pd.concat(matched_frames, ignore_index=True)
    # Group by the scalar column name rather than a one-element list: with a
    # list, pandas >= 2.0 yields 1-tuples as group keys, which would be
    # written to the CSV as "('<doi>',)" instead of the DOI itself.
    for doi, g in big_df.groupby('doi'):
        # Drop missing ISSNs (NaN is a float and would break str.join) and
        # sort so the joined string is identical from run to run.
        issns = sorted(set(g['issn'].dropna().astype(str)))
        pp.append({'doi': doi, 'count': g['count'].sum(), 'issn': '; '.join(issns)})
else:
    # No input files, or none of them matched a seed DOI.
    big_df = pd.DataFrame()

# Fixing the column list keeps the header row present even when `pp` is empty.
df = pd.DataFrame(data=pp, columns=['doi', 'count', 'issn'])
df.to_csv('inspec数据库施引测试数据.csv', index=False)