# -*- coding: utf-8 -*-
# @date   2023/12/29 8:58
# @author LiuYiJie
# @file   test_spider
"""Spider module containing the `SpiderMeta` metaclass and a few built-in
spider classes. A user-defined spider class must use `SpiderMeta` as its
metaclass and implement a `gets` method; `gets` must return proxies in
ip:port form.
"""
import time
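
# `get_page` is used by the spider classes below but is not defined in this
# file; it presumably comes from another module of the project. A minimal
# sketch of what it is assumed to do (fetch a URL and return a parsed
# BeautifulSoup document); the headers and parser below are illustrative,
# not the project's actual implementation.
import requests
from bs4 import BeautifulSoup


def get_page(url):
    """Fetch `url` and return its HTML parsed as a BeautifulSoup document."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    return BeautifulSoup(resp.text, 'html.parser')
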
class SpiderMeta(type):
    spiders = []

    def _init(cls):
        """Constructor attached to spider classes.
        :return: None
        """
        cls._counter = 1

    def _increment(cls, count):
        """Increase the spider's page counter.
        :param count: amount to add to the counter
        :return: None
        """
        cls._counter += count

    def _flush(cls):
        """Reset the counter to 1.
        :return: None
        """
        cls._counter = 1

    def __new__(cls, *args, **kwargs):
        """Build a spider class.
        :param args: args[0] = name, args[1] = bases, args[2] = attrs.
        :param kwargs: none.
        :return: the new class
        """
        # Every spider class must define a `gets` method.
        if 'gets' not in args[2]:
            raise ValueError(f"spider class '{args[0]}' must define a `gets` method")
        # Attach the default methods to the spider class.
        args[2]['__init__'] = lambda self: SpiderMeta._init(self)
        args[2]['increment'] = lambda self, count: SpiderMeta._increment(self, count)
        args[2]['flush'] = lambda self: SpiderMeta._flush(self)
        # Create the class once, register it in `spiders`, then return it.
        new_cls = type.__new__(cls, *args, **kwargs)
        SpiderMeta.spiders.append(new_cls)
        return new_cls
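
# Example of a user-defined spider (illustrative, not part of the original
# module): any class created with `metaclass=SpiderMeta` must provide `gets`
# and is automatically registered in `SpiderMeta.spiders`. Kept as a comment
# so it is not registered on import; the URL and return value are placeholders.
#
#     class MySpider(metaclass=SpiderMeta):
#         start_url = 'http://example.com/proxies/{}'
#
#         def gets(self, page_total=1):
#             return ['127.0.0.1:8080']
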
class Proxy360Spider(metaclass=SpiderMeta):
    start_url = 'http://www.proxy360.cn/default.aspx'

    def gets(self, page_total=None):
        """Scrape proxy360.cn and return proxies as 'ip:port' strings."""
        ans = []
        soup = get_page(self.start_url)
        for proxy in soup.find_all('div', {'class': 'proxylistitem'}):
            item = proxy.find_all('span', {"class": "tbBottomLine"})
            ip = item[0].get_text().replace('\r\n', '').replace(' ', '')
            port = item[1].get_text().replace('\r\n', '').replace(' ', '')
            ans.append(':'.join([ip, port]))
        return ans
class Daili666Spider(metaclass=SpiderMeta):
    start_url = 'http://www.66ip.cn/{}.html'

    def gets(self, page_total=3):
        """Scrape `page_total` pages of 66ip.cn and return 'ip:port' proxies."""
        urls = [self.start_url.format(i)
                for i in range(self._counter, self._counter + page_total)]
        self.increment(page_total)
        ans = []
        for url in urls:
            soup = get_page(url)
            # Sleep 1s between requests to avoid getting banned.
            time.sleep(1)
            proxy_list = soup.find('table', {"border": "2px"})
            for proxy in proxy_list.find_all('tr')[1:]:
                ip = proxy.find_all('td')[0].get_text()
                port = proxy.find_all('td')[1].get_text()
                ans.append(':'.join([ip, port]))
        return ans
class KuaidailiSpider(metaclass=SpiderMeta):
    start_url = 'http://www.kuaidaili.com/free/inha/{}/'

    def gets(self, page_total=2):
        """Scrape `page_total` pages of kuaidaili.com and return 'ip:port' proxies."""
        urls = [self.start_url.format(i)
                for i in range(self._counter, self._counter + page_total)]
        self.increment(page_total)
        ans = []
        for url in urls:
            soup = get_page(url)
            time.sleep(1)
            proxy_list = soup.find('table',
                                   {'class': 'table table-bordered table-striped'}) \
                             .find('tbody')
            for proxy in proxy_list.find_all('tr'):
                tmp = proxy.find_all('td')
                ip = tmp[0].get_text()
                port = tmp[1].get_text()
                ans.append(':'.join([ip, port]))
        return ans
class XiciSpider(metaclass=SpiderMeta):
    start_url = 'http://www.xicidaili.com/nn/{}'

    def gets(self, page_total=2):
        """Scrape `page_total` pages of xicidaili.com and return 'ip:port' proxies."""
        urls = [self.start_url.format(i)
                for i in range(self._counter, self._counter + page_total)]
        self.increment(page_total)
        ans = []
        for url in urls:
            soup = get_page(url)
            time.sleep(1)
            proxy_list = soup.find('table', {'id': 'ip_list'}) \
                             .find_all('tr')[1:]
            for proxy in proxy_list:
                tmp = proxy.find_all('td')
                ip = tmp[1].get_text()
                port = tmp[2].get_text()
                ans.append(':'.join([ip, port]))
        return ans
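
# Illustrative helper (not part of the original file): one way the `spiders`
# registry built by the metaclass could be consumed. It instantiates every
# registered spider and merges the proxies returned by its `gets` method.
def crawl_all():
    proxies = []
    for spider_cls in SpiderMeta.spiders:
        spider = spider_cls()
        proxies.extend(spider.gets())
    return proxies
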
from fastapi import FastAPI, Query
from urllib.parse import quote
import uvicorn

app = FastAPI()


@app.get("/search/")
def search_items(query_param: str = Query(..., description="Your query parameter")):
    # URL-encode the query parameter.
    encoded_query_param = quote(query_param)
    return {"result": f"Searching for {encoded_query_param}"}


if __name__ == '__main__':
    uvicorn.run('test_spider:app', host='0.0.0.0', port=8080)
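
# Example request against the endpoint above (illustrative; assumes the
# server was started with the `uvicorn.run` call in this file):
#
#     curl 'http://localhost:8080/search/?query_param=proxy%20list'
#     # -> {"result": "Searching for proxy%20list"}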