Compare commits
No commits in common. '49317cac5558ad3fc7cc17f38e8f585aac95701a' and '598b4da46d32ee582f3e7c269776a634e7d730f4' have entirely different histories.
49317cac55
...
598b4da46d
@ -1,6 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# @Time : 2025/8/5 15:40
|
|
||||||
# @Author : zhaoxiangpeng
|
|
||||||
# @File : run.py
|
|
||||||
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
|||||||
# Automatically created by: scrapy startproject
|
|
||||||
#
|
|
||||||
# For more information about the [deploy] section see:
|
|
||||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
|
||||||
|
|
||||||
[settings]
|
|
||||||
default = wos.settings
|
|
||||||
|
|
||||||
[deploy]
|
|
||||||
#url = http://localhost:6800/
|
|
||||||
project = wos
|
|
@ -1,12 +0,0 @@
|
|||||||
# Define here the models for your scraped items
|
|
||||||
#
|
|
||||||
# See documentation in:
|
|
||||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
|
||||||
|
|
||||||
import scrapy
|
|
||||||
|
|
||||||
|
|
||||||
class WosItem(scrapy.Item):
    """Item container for records scraped by the wos project.

    No fields are declared yet; define them like::

        name = scrapy.Field()
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
|
|
@ -1,100 +0,0 @@
|
|||||||
# Define here the models for your spider middleware
|
|
||||||
#
|
|
||||||
# See documentation in:
|
|
||||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
|
||||||
|
|
||||||
from scrapy import signals
|
|
||||||
|
|
||||||
# useful for handling different item types with a single interface
|
|
||||||
from itemadapter import ItemAdapter
|
|
||||||
|
|
||||||
|
|
||||||
class WosSpiderMiddleware:
    """Spider middleware generated from the Scrapy project template.

    All hooks below are template no-ops. Not all methods need to be
    defined; if a method is not defined, Scrapy acts as if the spider
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        #
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        #
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        #
        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
|
||||||
|
|
||||||
|
|
||||||
class WosDownloaderMiddleware:
    """Downloader middleware generated from the Scrapy project template.

    All hooks below are template no-ops. Not all methods need to be
    defined; if a method is not defined, Scrapy acts as if the downloader
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        #
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        #
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        #
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
|
@ -1,13 +0,0 @@
|
|||||||
# Define your item pipelines here
|
|
||||||
#
|
|
||||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
||||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
|
||||||
|
|
||||||
|
|
||||||
# useful for handling different item types with a single interface
|
|
||||||
from itemadapter import ItemAdapter
|
|
||||||
|
|
||||||
|
|
||||||
class WosPipeline:
    """Pass-through item pipeline for the wos project.

    Currently applies no transformation; it simply forwards each item to
    the next pipeline stage.
    """

    def process_item(self, item, spider):
        # No processing yet — hand the item straight back to Scrapy.
        return item
|
|
@ -1,4 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
# @Time : 2025/8/5 15:43
|
|
||||||
# @Author : zhaoxiangpeng
|
|
||||||
# @File : push_cscd_task.py
|
|
@ -1,87 +0,0 @@
|
|||||||
# Scrapy settings for wos project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "wos"

SPIDER_MODULES = ["wos.spiders"]
NEWSPIDER_MODULE = "wos.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "wos (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
# NOTE(review): one in-flight request per domain plus a 1-second delay —
# presumably deliberate rate limiting for the target site; confirm the
# intended crawl rate before changing.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "wos.middlewares.WosSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "wos.middlewares.WosDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "wos.pipelines.WosPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
|
|
@ -1,4 +0,0 @@
|
|||||||
# This package will contain the spiders of your Scrapy project
|
|
||||||
#
|
|
||||||
# Please refer to the documentation for information on how to create and manage
|
|
||||||
# your spiders.
|
|
@ -1,20 +0,0 @@
|
|||||||
import json
|
|
||||||
import scrapy
|
|
||||||
from scrapy_redis.spiders import RedisSpider
|
|
||||||
from scrapy_redis.utils import bytes_to_str
|
|
||||||
|
|
||||||
|
|
||||||
class WosCscdIncrementSpider(RedisSpider):
    """Incremental WOS/CSCD spider driven by JSON task payloads from Redis.

    Each Redis queue entry is a JSON object describing one crawl task.
    NOTE(review): the payload is assumed to carry at least a ``url`` key —
    confirm the exact schema against the task producer (push_cscd_task.py).
    """

    name = "wos_cscd_increment"
    # allowed_domains = ["example.com"]
    # start_urls = ["https://example.com"]

    def make_request_from_data(self, data):
        """Build a scrapy.Request from one raw Redis queue entry.

        Bug fix: the original decoded and parsed the payload into ``cfg``
        but never returned a Request, so every queued task was silently
        dropped and the spider crawled nothing.
        """
        formatted_data = bytes_to_str(data, self.redis_encoding)
        cfg = json.loads(formatted_data)
        url = cfg.get("url")
        if not url:
            # Malformed task: log it instead of crashing the queue consumer.
            self.logger.warning("Skipping task without 'url': %r", cfg)
            return None
        # Carry the whole task payload through to parse() for later use;
        # dont_filter because incremental tasks may legitimately revisit URLs.
        return scrapy.Request(
            url,
            callback=self.parse,
            cb_kwargs={"task": cfg},
            dont_filter=True,
        )

    def parse(self, response, task=None):
        # TODO: extract records from the response; ``task`` holds the
        # originating queue payload.
        pass
|
|
@ -1,11 +0,0 @@
|
|||||||
# Automatically created by: scrapy startproject
|
|
||||||
#
|
|
||||||
# For more information about the [deploy] section see:
|
|
||||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
|
||||||
|
|
||||||
[settings]
|
|
||||||
default = wos_cscd.settings
|
|
||||||
|
|
||||||
[deploy]
|
|
||||||
#url = http://localhost:6800/
|
|
||||||
project = wos_cscd
|
|
@ -1,12 +0,0 @@
|
|||||||
# Define here the models for your scraped items
|
|
||||||
#
|
|
||||||
# See documentation in:
|
|
||||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
|
||||||
|
|
||||||
import scrapy
|
|
||||||
|
|
||||||
|
|
||||||
class WosCscdItem(scrapy.Item):
    """Item container for records scraped by the wos_cscd project.

    No fields are declared yet; define them like::

        name = scrapy.Field()
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
|
|
@ -1,100 +0,0 @@
|
|||||||
# Define here the models for your spider middleware
|
|
||||||
#
|
|
||||||
# See documentation in:
|
|
||||||
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
|
||||||
|
|
||||||
from scrapy import signals
|
|
||||||
|
|
||||||
# useful for handling different item types with a single interface
|
|
||||||
from itemadapter import ItemAdapter
|
|
||||||
|
|
||||||
|
|
||||||
class WosCscdSpiderMiddleware:
    """Spider middleware generated from the Scrapy project template.

    All hooks below are template no-ops. Not all methods need to be
    defined; if a method is not defined, Scrapy acts as if the spider
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        #
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        #
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        #
        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
|
||||||
|
|
||||||
|
|
||||||
class WosCscdDownloaderMiddleware:
    """Downloader middleware generated from the Scrapy project template.

    All hooks below are template no-ops. Not all methods need to be
    defined; if a method is not defined, Scrapy acts as if the downloader
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        #
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        #
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        #
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
|
@ -1,13 +0,0 @@
|
|||||||
# Define your item pipelines here
|
|
||||||
#
|
|
||||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
||||||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
|
||||||
|
|
||||||
|
|
||||||
# useful for handling different item types with a single interface
|
|
||||||
from itemadapter import ItemAdapter
|
|
||||||
|
|
||||||
|
|
||||||
class WosCscdPipeline:
    """Pass-through item pipeline for the wos_cscd project.

    Currently applies no transformation; it simply forwards each item to
    the next pipeline stage.
    """

    def process_item(self, item, spider):
        # No processing yet — hand the item straight back to Scrapy.
        return item
|
|
@ -1,87 +0,0 @@
|
|||||||
# Scrapy settings for wos_cscd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "wos_cscd"

SPIDER_MODULES = ["wos_cscd.spiders"]
NEWSPIDER_MODULE = "wos_cscd.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "wos_cscd (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
# NOTE(review): one in-flight request per domain plus a 1-second delay —
# presumably deliberate rate limiting for the target site; confirm the
# intended crawl rate before changing.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "wos_cscd.middlewares.WosCscdSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "wos_cscd.middlewares.WosCscdDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "wos_cscd.pipelines.WosCscdPipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
|
|
@ -1,4 +0,0 @@
|
|||||||
# This package will contain the spiders of your Scrapy project
|
|
||||||
#
|
|
||||||
# Please refer to the documentation for information on how to create and manage
|
|
||||||
# your spiders.
|
|
Loading…
Reference in New Issue