cssci spider
parent 3e3d237388
commit 21963b7f80
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 16:51
# @Author : zhaoxiangpeng
# @File : cssci.py
# Data source name
SOURCE_NAME = 'cssci'


# API configuration
CSSCI_CONTROL_API = 'http://cssci.nju.edu.cn/control/controllers.php'
# Search (result list) endpoint
CSSCI_SEARCH_API = CSSCI_CONTROL_API
# Article detail endpoint
CSSCI_ARTICLE_DETAIL_API = CSSCI_CONTROL_API

# Request headers configuration
POST_HEADERS_CONFIG = {
    'content-type': 'application/x-www-form-urlencoded',
    'host': 'cssci.nju.edu.cn',
    'origin': 'http://cssci.nju.edu.cn',
    'referer': 'http://cssci.nju.edu.cn/index.html',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Task conditions
TASK_CONDITION_CONFIG = dict(
    task_table='task_search_strategy',
    task_field=["id", "content", "param"],
    task_condition="source_type = 8",
    db_type="mysql",
    batch_limit=1
)
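All three API constants point at the same controllers.php endpoint; which data comes back is decided by the form fields posted to it. A minimal sketch of issuing such a form-encoded call with these settings (the control/action/rand fields mirror the session check in scripts/get_cookie.py further down; this is illustrative, not part of the commit):

import asyncio
from random import random

import aiohttp

from science_article_cssci.configs.cssci import CSSCI_CONTROL_API, POST_HEADERS_CONFIG


async def check_user_online() -> str:
    # Form-encoded POST against the shared control endpoint
    async with aiohttp.ClientSession() as session:
        async with session.post(
            CSSCI_CONTROL_API,
            headers=POST_HEADERS_CONFIG,
            data={'control': 'user_control', 'action': 'check_user_online', 'rand': random()},
        ) as resp:
            return await resp.text()


if __name__ == '__main__':
    print(asyncio.run(check_user_online()))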
@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/16 14:11
# @Author : zhaoxiangpeng
# @File : kafka.py
import asyncio
import logging

from aiokafka import AIOKafkaProducer
from aiokafka.errors import KafkaError

logger = logging.getLogger(__name__)


class KafkaUtil:
    MAX_RETRIES = 3
    BASE_DELAY = 2  # seconds: base for the exponential backoff

    def __init__(self, producer):
        self._kafka_producer: AIOKafkaProducer = producer
        self.logger = logging.getLogger(__name__)

    async def producer_send(self, data, topic, retries: int = 0):
        """Send one message; retry with exponential backoff on failure."""
        msg_id = data.get("id", "unknown_id")
        try:
            # send_and_wait resolves once the broker acknowledges the record
            record_metadata = await self._kafka_producer.send_and_wait(topic, value=data)
        except KafkaError as exc:
            self.logger.info(f"{msg_id} - send attempt {retries + 1} failed: {exc}")
            if retries < self.MAX_RETRIES:
                delay = self.BASE_DELAY * (2 ** retries)
                self.logger.info(f"{msg_id} - retrying in {delay}s, attempt {retries + 1}/{self.MAX_RETRIES}")
                await asyncio.sleep(delay)  # non-blocking delay before the retry
                await self.producer_send(data, topic, retries + 1)
            else:
                self.logger.error(f"{msg_id} - exceeded the maximum number of retries, please investigate")
        else:
            self.on_send_success(record_metadata, data)

    def on_send_success(self, record_metadata, data):
        msg_id = data.get("id", "unknown_id")
        self.logger.info(
            f"{msg_id} - sent successfully: "
            f"topic={record_metadata.topic} - partition={record_metadata.partition} - offset={record_metadata.offset}")
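A minimal usage sketch for KafkaUtil. It assumes kafka.py lives under science_article_cssci/utils/ and that the producer is created with a JSON value serializer (producer_send passes a dict as value, so the producer has to serialize it); the broker list and topic mirror KAFKA_SERVERS and KAFKA_TOPIC from settings.py:

import asyncio
import json

from aiokafka import AIOKafkaProducer

from science_article_cssci.utils.kafka import KafkaUtil


async def main():
    # value_serializer turns the dict passed to producer_send into bytes
    producer = AIOKafkaProducer(
        bootstrap_servers=['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092'],
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
    )
    await producer.start()
    try:
        util = KafkaUtil(producer)
        await util.producer_send({"id": "demo-1", "payload": "..."}, topic="test2kafka")
    finally:
        await producer.stop()


if __name__ == '__main__':
    asyncio.run(main())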
@@ -0,0 +1,34 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScienceArticleCssciItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class AddItemBase(scrapy.Item):
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class ArticleItem(AddItemBase):
    exported = scrapy.Field()


class ArticleCitedItem(AddItemBase):
    cited = scrapy.Field()


class CssciArticleItem(ArticleItem):
    __tablename__ = 'data_cssci_article'

    third_id = scrapy.Field()
    resp_raw = scrapy.Field()
    detailed = scrapy.Field()
    updated_at = scrapy.Field()
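A short sketch of how CssciArticleItem is filled. It mirrors the detail spider further down, which stores the raw detail-API payload under resp_raw; the id value here is a placeholder:

from science_article_cssci.items import CssciArticleItem

# third_id is the CSSCI article id; resp_raw holds the raw detail-API response
item = CssciArticleItem(third_id="123456", resp_raw={"content": {}, "author": [], "catation": []})
print(dict(item))  # {'third_id': '123456', 'resp_raw': {...}}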
@@ -0,0 +1,143 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
from typing import Optional
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.http.headers import Headers

from science_article_cssci.scripts.get_cookie import GetSessionID

logger = logging.getLogger(__name__)


class ScienceArticleCssciSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScienceArticleCssciDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class CssciCookieMiddleware:
    ss: Optional[GetSessionID]

    def __init__(self, custom_headers: dict, cookie_cfg: dict):
        self.custom_headers = custom_headers
        self.headers = Headers(self.custom_headers)
        self.cookies_pool_config = cookie_cfg

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        post_headers = crawler.settings.getdict('POST_HEADERS_CONFIG')
        s = cls(
            custom_headers=post_headers,
            cookie_cfg=dict(
                redis_uri=settings.get("COOKIE_POOL_CONFIG"),
                pool_key=settings.get("COOKIE_POOL_REDIS_KEY"),
                ttl=settings.get("COOKIE_REDIS_TTL")
            )
        )
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        self.ss = GetSessionID(**self.cookies_pool_config)

    async def process_request(self, request, spider):
        cookie_1 = await self.ss.get_cookie_from_redis()
        if not cookie_1:
            cookie_1 = await self.ss.get_cookie_to_redis()
            logger.info("No usable cookie in the pool, fetched a new one: %s", cookie_1)

        request.cookies = cookie_1
        request.headers = self.headers
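CssciCookieMiddleware pulls everything it needs from the crawler settings, so it only works when the keys below are defined. A settings sketch for wiring it up; the middleware path and priority 540 mirror the spider's custom_settings, the cookie-pool keys mirror settings.py, and the redis URI and truncated user-agent here are placeholders:

# Settings sketch for the cookie middleware (values mirror settings.py).
DOWNLOADER_MIDDLEWARES = {
    "science_article_cssci.middlewares.CssciCookieMiddleware": 540,
}

POST_HEADERS_CONFIG = {
    'host': 'cssci.nju.edu.cn',
    'origin': 'http://cssci.nju.edu.cn',
    'referer': 'http://cssci.nju.edu.cn/index.html',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',  # placeholder
}

COOKIE_POOL_CONFIG = 'redis://:password@redis-host:6379/10'  # placeholder redis URI for the cookie pool
COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session'         # key holding the serialized cookie
COOKIE_REDIS_TTL = 60 * 60 * 6                               # cookie lifetime in seconds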
@@ -0,0 +1,62 @@
[source_type]
1 = 论文
2 = 综述
3 = 评论
5 = 报告
4 = 传记资料
9 = 其他

[source_jj]
1 = 国家自科基金
2 = 国家社科基金
3 = 国家级其它基金
4 = 教育部基金
5 = 其他部委级基金
6 = 中科院基金
7 = 社科院基金
8 = 省(市)级基金
9 = 其它基金

[yw_type]
1 = 期刊论文
11 = 电子文献
10 = 法规
9 = 标准
8 = 报告
7 = 汇编
6 = 信件
5 = 学位论文
4 = 会议文献
3 = 报纸
2 = 图书
99 = 其他

[source_xk]
630 = 管理学
850 = 民族学
860 = 新闻学与传播学
870 = 图书馆、情报与文献学
880 = 教育学
890 = 体育学
910 = 统计学
920 = 心理学
930 = 社会科学总论
940 = 军事学
950 = 文化学
960 = 人文、经济地理
970 = 环境科学
840 = 社会学
820 = 法学
710 = 马克思主义
720 = 哲学
730 = 宗教学
740 = 语言学
009 = 文学
751 = 外国文学
752 = 中国文学
760 = 艺术学
770 = 历史学
780 = 考古学
790 = 经济学
810 = 政治学
999 = 其他学科
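These sections map the numeric codes used by the CSSCI API to human-readable labels. A minimal sketch of reading them with configparser; the file path is an assumption, the diff does not name this INI file:

import configparser

# Path is assumed; the commit does not show where this INI file lives.
parser = configparser.ConfigParser()
parser.read('science_article_cssci/configs/cssci_codes.ini', encoding='utf-8')

# e.g. translate a document-type code and a discipline code coming back from the API
print(parser['source_type']['1'])    # -> 论文
print(parser['source_xk']['860'])    # -> 新闻学与传播学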
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/12 16:10
# @Author : zhaoxiangpeng
# @File : get_cookie.py
# Used to obtain the session_id

import asyncio
import json
from random import random

import redis.asyncio
import aiohttp


class GetSessionID:
    __redis_cli = None

    def __init__(self, redis_uri: str, pool_key: str, ttl: int = None, **kwargs):
        self.redis_uri = redis_uri
        self.pool_key = pool_key
        self.ttl = ttl

    @staticmethod
    async def new_session_id() -> dict:
        session = aiohttp.ClientSession()
        resp = await session.get(
            'http://cssci.nju.edu.cn/index.html',
            headers={
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
                'host': 'cssci.nju.edu.cn'
            }
        )
        assert resp.status == 200
        resp = await session.post(
            'http://cssci.nju.edu.cn/control/controllers.php',
            headers={
                'content-type': 'application/x-www-form-urlencoded',
                'host': 'cssci.nju.edu.cn',
                'origin': 'http://cssci.nju.edu.cn',
                'referer': 'http://cssci.nju.edu.cn/index.html',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
            },
            data=dict(control='user_control', action='check_user_online', rand=random()),
        )
        assert resp.status == 200
        # Convert the cookie jar to a dict and return it
        cookie_obj = dict()
        cookie_jar = session.cookie_jar.filter_cookies(resp.url)
        for key, jar in cookie_jar.items():
            cookie_obj.setdefault(jar.key, jar.value)
        # Close the session
        await session.close()
        return cookie_obj

    @property
    def redis_cli(self):
        if self.__class__.__redis_cli is None:
            self.__class__.__redis_cli = redis.asyncio.Redis.from_url(
                self.redis_uri,
                decode_responses=True
            )
        return self.__class__.__redis_cli

    async def set_cookie_to_redis(self, val):
        result = await self.redis_cli.setex(self.pool_key, time=self.ttl, value=val)
        return result

    async def get_cookie_from_redis(self, to_dict=True):
        """
        :param to_dict: whether to convert the stored string into a dict
        :return:
        """
        cookie_str = await self.redis_cli.get(self.pool_key)
        if not cookie_str:
            return cookie_str
        if to_dict:
            return json.loads(cookie_str)
        return cookie_str

    async def get_cookie_to_redis(self) -> dict:
        """
        Fetch a fresh cookie and push it straight into redis
        :return:
        """
        cookie_obj = await self.new_session_id()
        await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
        return cookie_obj

    async def test(self):
        from loguru import logger
        cookie_obj = await self.new_session_id()
        logger.info("cookie: %s" % cookie_obj)
        res = await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
        logger.info("set: %s" % res)
        res = await self.get_cookie_from_redis()
        logger.info("get: %s" % res)

    def main(self):
        asyncio.run(self.test())


if __name__ == '__main__':
    # Constructor arguments are required; these mirror REDIS_URL and the
    # cookie-pool settings defined in settings.py.
    GetSessionID(
        redis_uri='redis://:kcidea1509@192.168.1.211:6379/10',
        pool_key='cookies_pool:cssci:session',
        ttl=60 * 60 * 6,
    ).main()
@@ -0,0 +1,110 @@
# Scrapy settings for science_article_cssci project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "science_article_cssci"

SPIDER_MODULES = ["science_article_cssci.spiders"]
NEWSPIDER_MODULE = "science_article_cssci.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}
POST_HEADERS_CONFIG = {
    # 'content-type': 'application/x-www-form-urlencoded',
    'host': 'cssci.nju.edu.cn',
    'origin': 'http://cssci.nju.edu.cn',
    'referer': 'http://cssci.nju.edu.cn/index.html',
    'user-agent': USER_AGENT
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "science_article_cssci.middlewares.ScienceArticleCssciSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "science_article_cssci.middlewares.ScienceArticleCssciDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "science_article_cssci.pipelines.ScienceArticleCssciPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"

MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'

MONGO_URI_SCIENCE = "mongodb://root:kcidea1509!%25)(@43.140.203.187:27017/"
MONGO_DATABASE_SCIENCE = 'science'

REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# cookie redis configuration
COOKIE_POOL_CONFIG = REDIS_URL
COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session'
COOKIE_REDIS_TTL = 60 * 60 * 6

KAFKA_SERVERS = ['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092']
KAFKA_TOPIC = "test2kafka"
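MONGO_URI, MONGO_DATABASE and the Kafka settings above are consumed by the pipelines referenced in the spider's custom_settings. pipelines.py is not part of this commit, so the following is only a hypothetical sketch of how such a pipeline could pick those settings up via from_crawler:

# Hypothetical sketch only: pipelines.py is not included in this commit.
import pymongo


class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Collection name is an assumption; CssciArticleItem declares
        # __tablename__ = 'data_cssci_article'
        self.db["data_cssci_article"].insert_one(dict(item))
        return item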
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,63 @@
import re
import json
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str
from science_article_cssci.utils import model
from science_article_cssci.items import CssciArticleItem
from science_article_cssci.configs import cssci as config


class CssciArticleByIdSpider(RedisSpider):
    name = "cssci_article_by_id"
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_cssci.middlewares.CssciCookieMiddleware": 540,
        },
        ITEM_PIPELINES={
            "science_article_cssci.pipelines.BuildDetailPipeline": 300,
            "science_article_cssci.pipelines.MongoPipeline": 310,
            "science_article_cssci.pipelines.KafkaPipeline": 350,
        },
        # LOG_LEVEL="INFO"
    )

    def make_request_from_data(self, data):
        data = bytes_to_str(data)
        data = json.loads(data)
        third_id = data.get("third_id")
        yield scrapy.FormRequest(
            url=config.CSSCI_ARTICLE_DETAIL_API, method="GET",
            formdata=model.get_article_detail_param(third_id=third_id), callback=self.parse_detail,
            meta={"third_id": third_id})

    def parse_detail(self, response, **kwargs):
        def change_qi(tmp_str):
            return re.sub(r'^0|0$', '', tmp_str)

        def change_string(tmp_str):
            tmp = tmp_str.split("aaa")
            tmp_z = []
            for t in tmp:
                if len(t) > 1:
                    tmp_z.append(t)
            return tmp_z

        # print(response)
        meta = response.meta
        third_id = meta['third_id']
        resp_json = response.json()
        contents: list = resp_json.get('contents', [])
        body = [c for c in contents if c.get("sno") == third_id]
        if body:
            content = body[0]
        else:
            content = {}
        d = dict(
            content=content,
            author=resp_json.get("author"),
            catation=resp_json.get("catation"),
        )
        self.logger.debug(d)
        article_item = CssciArticleItem(**dict(third_id=third_id, resp_raw=d))
        yield article_item
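The spider is a scrapy_redis RedisSpider, so it waits for JSON task messages containing a third_id. A minimal sketch of queueing one task, assuming the default scrapy_redis start key "&lt;spider name&gt;:start_urls"; the redis password and the third_id value are placeholders, the host mirrors REDIS_URL in settings.py:

import json

import redis

# Placeholder credentials; the real URI is REDIS_URL in settings.py.
r = redis.Redis.from_url('redis://:password@192.168.1.211:6379/10')

# make_request_from_data() json-decodes this payload and reads "third_id"
task = {"third_id": "1004545549"}  # placeholder id
r.lpush('cssci_article_by_id:start_urls', json.dumps(task, ensure_ascii=False))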
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/13 11:19
# @Author : zhaoxiangpeng
# @File : __init__.py
@@ -0,0 +1,91 @@
search_type = {
    15: '所有字段',
    1: '篇名(词)',
    17: '英文篇名',
    # 2:'篇名(词)(精确)',
    3: '作者',
    # 4:'作者(精确)',
    5: '作者(第一作者)',
    # 16:"作者(第一作者+精确)",
    6: "关键词",
    # 7:"关键词(精确)",
    8: '期刊名称',
    # 9:'期刊名称(精确)',
    10: '作者机构',
    # 11:'作者机构(第一机构)',
    # 12:'作者地区',
    13: '中图类号',
    14: '基金细节'
}

search_type_z = {
    15: '所有字段',
    1: '篇名(词)',
    17: '英文篇名',
    2: '篇名(词)(精确)',
    3: '作者',
    4: '作者(精确)',
    5: '作者(第一作者)',
    16: "作者(第一作者+精确)",
    6: "关键词",
    7: "关键词(精确)",
    8: '期刊名称',
    9: '期刊名称(精确)',
    10: '作者机构',
    11: '作者机构(第一机构)',
    12: '作者地区',
    13: '中图类号',
    18: '期刊名称(精确)',
    14: '基金细节'
}

search_type_s = {
    15: '所有字段',
    1: '篇名(词)',
    17: '英文篇名',
    # 2:'篇名(词)(精确)',
    3: '作者',
    # 4:'作者(精确)',
    # 5:'作者(第一作者)',
    # 16:"作者(第一作者+精确)",
    6: "关键词",
    # 7:"关键词(精确)",
    8: '期刊名称',
    # 9:'期刊名称(精确)',
    10: '作者机构',
    # 11:'作者机构(第一机构)',
    12: '作者地区',
    13: '中图类号',
    14: '基金细节'
}
search_type_ly = {
    4: '被引作者(精确)',
    3: '被引作者',
    5: '被引作者(排除自引)',
    1: '被引篇名(词)',
    2: '被引篇名(词)(精确)',
    6: '被引期刊名称',
    7: '被引期刊名称(精确)',
    # 8:'期刊名称(排除自引)',
    9: '被引文献细节'
}

search_type_ly_x = {
    # 4:'被引作者(精确)',
    3: '被引作者',
    5: '被引作者(排除自引)',
    1: '被引篇名(词)',
    # 2:'被引篇名(词)(精确)',
    6: '被引期刊名称',
    # 7:'被引期刊名称(精确)',
    # 8:'期刊名称(排除自引)',
    9: '被引文献细节'
}

order_value = {
    'nian': "年代",
    'lypm ': "篇名(词)",
    # 'nian':"被引次数",
    'bz': "作者"
    # 'nian':"相关度"
}