cssci spider
parent 3e3d237388
commit 21963b7f80
@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/19 16:51
# @Author : zhaoxiangpeng
# @File : cssci.py

# Data source name
SOURCE_NAME = 'cssci'


# API configuration
CSSCI_CONTROL_API = 'http://cssci.nju.edu.cn/control/controllers.php'
# Search-list endpoint
CSSCI_SEARCH_API = CSSCI_CONTROL_API
# Article detail endpoint
CSSCI_ARTICLE_DETAIL_API = CSSCI_CONTROL_API

# Request headers configuration
POST_HEADERS_CONFIG = {
    'content-type': 'application/x-www-form-urlencoded',
    'host': 'cssci.nju.edu.cn',
    'origin': 'http://cssci.nju.edu.cn',
    'referer': 'http://cssci.nju.edu.cn/index.html',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Task condition
TASK_CONDITION_CONFIG = dict(
    task_table='task_search_strategy',
    task_field=["id", "content", "param"],
    task_condition="source_type = 8",
    db_type="mysql",
    batch_limit=1
)
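A minimal sketch (not part of the commit) of how TASK_CONDITION_CONFIG could be expanded into the SQL that fetches pending search tasks; the table, columns, and condition come straight from the config above, while the helper itself is an assumption:

def build_task_query(cfg=TASK_CONDITION_CONFIG):
    # hypothetical helper: assembles
    # "SELECT id, content, param FROM task_search_strategy WHERE source_type = 8 LIMIT 1"
    fields = ", ".join(cfg["task_field"])
    return (f"SELECT {fields} FROM {cfg['task_table']} "
            f"WHERE {cfg['task_condition']} LIMIT {cfg['batch_limit']}")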
@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
# @Time : 2026/1/16 14:11
# @Author : zhaoxiangpeng
# @File : kafka.py
import asyncio
import logging

from aiokafka import AIOKafkaProducer
from aiokafka.errors import KafkaError

logger = logging.getLogger(__name__)


class KafkaUtil:
    MAX_RETRIES = 3
    BASE_DELAY = 2  # seconds: base delay for exponential backoff

    def __init__(self, producer):
        self._kafka_producer: AIOKafkaProducer = producer
        self.logger = logging.getLogger(__name__)

    async def producer_send(self, data, topic, retries=0):
        try:
            # aiokafka's send() returns an asyncio.Future that resolves to the record metadata
            future = await self._kafka_producer.send(topic, value=data)
            future.add_done_callback(self.on_send_done_factory(data, topic, retries))
        except KafkaError as e:
            self.logger.error(f'{data.get("id", "")} - produce failed\n'
                              f'reason: {e}')

    def on_send_success(self, record_metadata, data):
        msg_id = data.get("id", "unknown_id")
        self.logger.info(
            f"{msg_id} - successfully sent to: "
            f"topic={record_metadata.topic} - partition={record_metadata.partition} - offset={record_metadata.offset}")

    def on_send_done_factory(self, data, topic, retries=0):
        msg_id = data.get("id", "unknown_id")

        def on_send_done(future):
            exc = future.exception()
            if exc is None:
                self.on_send_success(future.result(), data)
                return
            self.logger.info(f"{msg_id} - send attempt {retries + 1} failed: {exc}")
            if retries < self.MAX_RETRIES:
                delay = self.BASE_DELAY * (2 ** retries)
                self.logger.info(f"{msg_id} - retrying in {delay}s, attempt {retries + 1}/{self.MAX_RETRIES}")
                # schedule the retry on the event loop instead of blocking with time.sleep
                asyncio.get_running_loop().call_later(
                    delay,
                    lambda: asyncio.ensure_future(self.producer_send(data, topic, retries + 1)))
            else:
                self.logger.error(f"{msg_id} - exceeded the maximum number of retries, please investigate")

        return on_send_done
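A minimal usage sketch (not part of the commit) showing how KafkaUtil can be wired to an AIOKafkaProducer; the bootstrap servers and topic mirror KAFKA_SERVERS / KAFKA_TOPIC from settings.py further down, and the JSON value_serializer is an assumption about how dict payloads are encoded:

import json
import asyncio
from aiokafka import AIOKafkaProducer

async def demo():
    producer = AIOKafkaProducer(
        bootstrap_servers=['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092'],
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
    await producer.start()
    try:
        # fire one test message through the retry-aware helper
        await KafkaUtil(producer).producer_send({"id": "demo-1"}, "test2kafka")
    finally:
        await producer.stop()

asyncio.run(demo())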
@ -0,0 +1,34 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScienceArticleCssciItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class AddItemBase(scrapy.Item):
    third_id = scrapy.Field()
    updated_at = scrapy.Field()


class ArticleItem(AddItemBase):
    exported = scrapy.Field()


class ArticleCitedItem(AddItemBase):
    cited = scrapy.Field()


class CssciArticleItem(ArticleItem):
    __tablename__ = 'data_cssci_article'

    third_id = scrapy.Field()
    resp_raw = scrapy.Field()
    detailed = scrapy.Field()
    updated_at = scrapy.Field()
@ -0,0 +1,143 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import logging
from typing import Optional

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.http.headers import Headers

from science_article_cssci.scripts.get_cookie import GetSessionID

logger = logging.getLogger(__name__)


class ScienceArticleCssciSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class ScienceArticleCssciDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class CssciCookieMiddleware:
    ss: Optional[GetSessionID]

    def __init__(self, custom_headers: dict, cookie_cfg: dict):
        self.custom_headers = custom_headers
        self.headers = Headers(self.custom_headers)
        self.cookies_pool_config = cookie_cfg

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        post_headers = settings.getdict('POST_HEADERS_CONFIG')
        s = cls(
            custom_headers=post_headers,
            cookie_cfg=dict(
                redis_uri=settings.get("COOKIE_POOL_CONFIG"),
                pool_key=settings.get("COOKIE_POOL_REDIS_KEY"),
                ttl=settings.get("COOKIE_REDIS_TTL")
            )
        )
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def spider_opened(self, spider):
        self.ss = GetSessionID(**self.cookies_pool_config)

    async def process_request(self, request, spider):
        cookie_1 = await self.ss.get_cookie_from_redis()
        if not cookie_1:
            cookie_1 = await self.ss.get_cookie_to_redis()
            logger.info("no usable cookie in the pool, fetched a new one: %s" % cookie_1)

        request.cookies = cookie_1
        request.headers = self.headers
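CssciCookieMiddleware pulls its configuration from the crawler settings; a minimal sketch of the entries it expects (illustrative values only, the real ones appear in settings.py below, and the middleware is actually enabled through the spider's custom_settings):

DOWNLOADER_MIDDLEWARES = {
    "science_article_cssci.middlewares.CssciCookieMiddleware": 540,
}
POST_HEADERS_CONFIG = {'user-agent': '...'}            # full header set lives in settings.py
COOKIE_POOL_CONFIG = 'redis://:password@host:6379/10'  # redis_uri handed to GetSessionID
COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session'   # pool_key
COOKIE_REDIS_TTL = 60 * 60 * 6                         # ttl in seconds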
@ -0,0 +1,62 @@
[source_type]
1 = 论文
2 = 综述
3 = 评论
5 = 报告
4 = 传记资料
9 = 其他

[source_jj]
1 = 国家自科基金
2 = 国家社科基金
3 = 国家级其它基金
4 = 教育部基金
5 = 其他部委级基金
6 = 中科院基金
7 = 社科院基金
8 = 省(市)级基金
9 = 其它基金

[yw_type]
1 = 期刊论文
11 = 电子文献
10 = 法规
9 = 标准
8 = 报告
7 = 汇编
6 = 信件
5 = 学位论文
4 = 会议文献
3 = 报纸
2 = 图书
99 = 其他

[source_xk]
630 = 管理学
850 = 民族学
860 = 新闻学与传播学
870 = 图书馆、情报与文献学
880 = 教育学
890 = 体育学
910 = 统计学
920 = 心理学
930 = 社会科学总论
940 = 军事学
950 = 文化学
960 = 人文、经济地理
970 = 环境科学
840 = 社会学
820 = 法学
710 = 马克思主义
720 = 哲学
730 = 宗教学
740 = 语言学
009 = 文学
751 = 外国文学
752 = 中国文学
760 = 艺术学
770 = 历史学
780 = 考古学
790 = 经济学
810 = 政治学
999 = 其他学科
@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/12 16:10
# @Author : zhaoxiangpeng
# @File : get_cookie.py
# Used to obtain a session_id cookie for cssci.nju.edu.cn

import asyncio
import json
from random import random

import redis.asyncio  # load the asyncio redis client explicitly
import aiohttp


class GetSessionID:
    __redis_cli = None

    def __init__(self, redis_uri: str, pool_key: str, ttl: int = None, **kwargs):
        self.redis_uri = redis_uri
        self.pool_key = pool_key
        self.ttl = ttl

    @staticmethod
    async def new_session_id() -> dict:
        session = aiohttp.ClientSession()
        resp = await session.get(
            'http://cssci.nju.edu.cn/index.html',
            headers={
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
                'host': 'cssci.nju.edu.cn'
            }
        )
        assert resp.status == 200
        resp = await session.post(
            'http://cssci.nju.edu.cn/control/controllers.php',
            headers={
                'content-type': 'application/x-www-form-urlencoded',
                'host': 'cssci.nju.edu.cn',
                'origin': 'http://cssci.nju.edu.cn',
                'referer': 'http://cssci.nju.edu.cn/index.html',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
            },
            data=dict(control='user_control', action='check_user_online', rand=random()),
        )
        assert resp.status == 200
        # convert the cookie jar to a dict and return it
        cookie_obj = dict()
        cookie_jar = session.cookie_jar.filter_cookies(resp.url)
        for key, jar in cookie_jar.items():
            cookie_obj.setdefault(jar.key, jar.value)
        # close the session
        await session.close()
        return cookie_obj

    @property
    def redis_cli(self):
        if self.__class__.__redis_cli is None:
            self.__class__.__redis_cli = redis.asyncio.Redis.from_url(
                self.redis_uri,
                decode_responses=True
            )
        return self.__class__.__redis_cli

    async def set_cookie_to_redis(self, val):
        result = await self.redis_cli.setex(self.pool_key, time=self.ttl, value=val)
        return result

    async def get_cookie_from_redis(self, to_dict=True):
        """
        :param to_dict: whether to parse the stored string into a dict
        :return:
        """
        cookie_str = await self.redis_cli.get(self.pool_key)
        if not cookie_str:
            return cookie_str
        if to_dict:
            return json.loads(cookie_str)
        return cookie_str

    async def get_cookie_to_redis(self) -> dict:
        """
        Fetch a fresh cookie and store it in redis.
        :return:
        """
        cookie_obj = await self.new_session_id()
        await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
        return cookie_obj

    async def test(self):
        from loguru import logger
        cookie_obj = await self.new_session_id()
        logger.info("cookie: %s" % cookie_obj)
        res = await self.set_cookie_to_redis(val=json.dumps(cookie_obj, ensure_ascii=False))
        logger.info("set: %s" % res)
        res = await self.get_cookie_from_redis()
        logger.info("get: %s" % res)

    def main(self):
        asyncio.run(self.test())


if __name__ == '__main__':
    # ad-hoc test entry; the redis_uri/pool_key below are placeholders,
    # the real values live in settings.py (REDIS_URL / COOKIE_POOL_REDIS_KEY)
    GetSessionID(redis_uri='redis://localhost:6379/0',
                 pool_key='cookies_pool:cssci:session',
                 ttl=60 * 60 * 6).main()
@ -0,0 +1,110 @@
# Scrapy settings for science_article_cssci project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "science_article_cssci"

SPIDER_MODULES = ["science_article_cssci.spiders"]
NEWSPIDER_MODULE = "science_article_cssci.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}
POST_HEADERS_CONFIG = {
    # 'content-type': 'application/x-www-form-urlencoded',
    'host': 'cssci.nju.edu.cn',
    'origin': 'http://cssci.nju.edu.cn',
    'referer': 'http://cssci.nju.edu.cn/index.html',
    'user-agent': USER_AGENT
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "science_article_cssci.middlewares.ScienceArticleCssciSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "science_article_cssci.middlewares.ScienceArticleCssciDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "science_article_cssci.pipelines.ScienceArticleCssciPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"

MONGO_URI = "mongodb://science-dev:kcidea1509!%25)(@101.43.239.105:27017/?authSource=science&directConnection=true"
MONGO_DATABASE = 'science2'

MONGO_URI_SCIENCE = "mongodb://root:kcidea1509!%25)(@43.140.203.187:27017/"
MONGO_DATABASE_SCIENCE = 'science'

REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# Cookie pool redis configuration
COOKIE_POOL_CONFIG = REDIS_URL
COOKIE_POOL_REDIS_KEY = 'cookies_pool:cssci:session'
COOKIE_REDIS_TTL = 60 * 60 * 6

KAFKA_SERVERS = ['hadoop01:9092', 'hadoop02:9092', 'hadoop03:9092']
KAFKA_TOPIC = "test2kafka"
@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@ -0,0 +1,63 @@
import re
import json

import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str

from science_article_cssci.utils import model
from science_article_cssci.items import CssciArticleItem
from science_article_cssci.configs import cssci as config


class CssciArticleByIdSpider(RedisSpider):
    name = "cssci_article_by_id"
    custom_settings = dict(
        DOWNLOADER_MIDDLEWARES={
            "science_article_cssci.middlewares.CssciCookieMiddleware": 540,
        },
        ITEM_PIPELINES={
            "science_article_cssci.pipelines.BuildDetailPipeline": 300,
            "science_article_cssci.pipelines.MongoPipeline": 310,
            "science_article_cssci.pipelines.KafkaPipeline": 350,
        },
        # LOG_LEVEL="INFO"
    )

    def make_request_from_data(self, data):
        data = bytes_to_str(data)
        data = json.loads(data)
        third_id = data.get("third_id")
        yield scrapy.FormRequest(
            url=config.CSSCI_ARTICLE_DETAIL_API, method="GET",
            formdata=model.get_article_detail_param(third_id=third_id), callback=self.parse_detail,
            meta={"third_id": third_id})

    def parse_detail(self, response, **kwargs):
        # the two helpers below are defined but not yet used in this parser
        def change_qi(tmp_str):
            return re.sub(r'^0|0$', '', tmp_str)

        def change_string(tmp_str):
            tmp = tmp_str.split("aaa")
            tmp_z = []
            for t in tmp:
                if len(t) > 1:
                    tmp_z.append(t)
            return tmp_z

        # print(response)
        meta = response.meta
        third_id = meta['third_id']
        resp_json = response.json()
        contents: list = resp_json.get('contents', [])
        body = [c for c in contents if c.get("sno") == third_id]
        if body:
            content = body[0]
        else:
            content = {}
        d = dict(
            content=content,
            author=resp_json.get("author"),
            catation=resp_json.get("catation"),
        )
        self.logger.debug(d)
        article_item = CssciArticleItem(**dict(third_id=third_id, resp_raw=d))
        yield article_item
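A minimal sketch (not part of the commit) of seeding this RedisSpider's queue: scrapy_redis defaults the start key to "<spider name>:start_urls" unless redis_key or REDIS_START_URLS_KEY overrides it, which this diff does not show, and the third_id value below is a placeholder:

import json
import redis

r = redis.Redis.from_url('redis://:kcidea1509@192.168.1.211:6379/10')  # REDIS_URL from settings.py
r.lpush('cssci_article_by_id:start_urls',
        json.dumps({"third_id": "<cssci sno>"}))  # make_request_from_data() parses this JSON payload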
@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2024/11/13 11:19
# @Author : zhaoxiangpeng
# @File : __init__.py
@ -0,0 +1,91 @@
search_type = {
    15: '所有字段',
    1: '篇名(词)',
    17: '英文篇名',
    # 2: '篇名(词)(精确)',
    3: '作者',
    # 4: '作者(精确)',
    5: '作者(第一作者)',
    # 16: "作者(第一作者+精确)",
    6: "关键词",
    # 7: "关键词(精确)",
    8: '期刊名称',
    # 9: '期刊名称(精确)',
    10: '作者机构',
    # 11: '作者机构(第一机构)',
    # 12: '作者地区',
    13: '中图类号',
    14: '基金细节'
}


search_type_z = {
    15: '所有字段',
    1: '篇名(词)',
    17: '英文篇名',
    2: '篇名(词)(精确)',
    3: '作者',
    4: '作者(精确)',
    5: '作者(第一作者)',
    16: "作者(第一作者+精确)",
    6: "关键词",
    7: "关键词(精确)",
    8: '期刊名称',
    9: '期刊名称(精确)',
    10: '作者机构',
    11: '作者机构(第一机构)',
    12: '作者地区',
    13: '中图类号',
    18: '期刊名称(精确)',
    14: '基金细节'
}


search_type_s = {
    15: '所有字段',
    1: '篇名(词)',
    17: '英文篇名',
    # 2: '篇名(词)(精确)',
    3: '作者',
    # 4: '作者(精确)',
    # 5: '作者(第一作者)',
    # 16: "作者(第一作者+精确)",
    6: "关键词",
    # 7: "关键词(精确)",
    8: '期刊名称',
    # 9: '期刊名称(精确)',
    10: '作者机构',
    # 11: '作者机构(第一机构)',
    12: '作者地区',
    13: '中图类号',
    14: '基金细节'
}

search_type_ly = {
    4: '被引作者(精确)',
    3: '被引作者',
    5: '被引作者(排除自引)',
    1: '被引篇名(词)',
    2: '被引篇名(词)(精确)',
    6: '被引期刊名称',
    7: '被引期刊名称(精确)',
    # 8: '期刊名称(排除自引)',
    9: '被引文献细节'
}


search_type_ly_x = {
    # 4: '被引作者(精确)',
    3: '被引作者',
    5: '被引作者(排除自引)',
    1: '被引篇名(词)',
    # 2: '被引篇名(词)(精确)',
    6: '被引期刊名称',
    # 7: '被引期刊名称(精确)',
    # 8: '期刊名称(排除自引)',
    9: '被引文献细节'
}


order_value = {
    'nian': "年代",
    'lypm ': "篇名(词)",
    # 'nian': "被引次数",
    'bz': "作者"
    # 'nian': "相关度"
}