# -*- coding: utf-8 -*-
# @date: 2023/12/29 8:58
# @Author: LiuYiJie
# @file: test_spider
"""Spider module, containing the `SpiderMeta` metaclass and a few initial
spider classes. A user-defined spider class must declare `SpiderMeta` as its
metaclass and override the `gets` method, which must return proxies in
`ip:port` form.
"""
import time
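
# NOTE: the original module calls `get_page` but never defines or imports it.
# The helper below is an assumed minimal implementation (requests +
# BeautifulSoup) added so the module runs standalone; swap in the project's
# own `get_page` helper if one exists.
import requests
from bs4 import BeautifulSoup


def get_page(url):
    """Fetch `url` and return it parsed as a BeautifulSoup document."""
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')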


class SpiderMeta(type):
    spiders = []

    def _init(cls):
        """Constructor attached to subclasses.
        :return: None
        """
        cls._counter = 1

    def _increment(cls, count):
        """Method attached to subclasses to advance the page counter.
        :param count: amount to add to the counter
        :return: None
        """
        cls._counter += count

    def _flush(cls):
        """Reset the counter to 1.
        :return: None
        """
        cls._counter = 1

    def __new__(cls, *args, **kwargs):
        """Construct a subclass.
        :param args: args[0] = name, args[1] = bases, args[2] = attrs.
        :param kwargs: none expected.
        :return: the new class
        """
        # Every spider class must define a `gets` method.
        if 'gets' not in args[2]:
            raise ValueError('spider class {} must define a `gets` method'.format(args[0]))
        # Attach some default methods to the spider class.
        args[2]['__init__'] = lambda self: SpiderMeta._init(self)
        args[2]['increment'] = lambda self, count: SpiderMeta._increment(self, count)
        args[2]['flush'] = lambda self: SpiderMeta._flush(self)
        # Create the class exactly once, register it in `spiders`, and return
        # it (the original built the class twice, so the registered class and
        # the one bound to the class name were different objects).
        new_cls = type.__new__(cls, *args, **kwargs)
        SpiderMeta.spiders.append(new_cls)
        return new_cls
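
# Usage sketch (an illustration, not part of the original file): every class
# declared with `metaclass=SpiderMeta` below is appended to
# `SpiderMeta.spiders`, so all registered spiders can be run in one loop:
#
#     for spider_cls in SpiderMeta.spiders:
#         spider = spider_cls()
#         print(spider.gets())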


class Proxy360Spider(metaclass=SpiderMeta):
    start_url = 'http://www.proxy360.cn/default.aspx'

    def gets(self, page_total=None):
        ans = []
        soup = get_page(self.start_url)
        for proxy in soup.find_all('div', {'class': 'proxylistitem'}):
            item = proxy.find_all('span', {"class": "tbBottomLine"})
            ip = item[0].get_text().replace('\r\n', '').replace(' ', '')
            port = item[1].get_text().replace('\r\n', '').replace(' ', '')
            ans.append(':'.join([ip, port]))
        return ans


class Daili666Spider(metaclass=SpiderMeta):
    start_url = 'http://www.66ip.cn/{}.html'

    def gets(self, page_total=3):
        urls = [self.start_url.format(i)
                for i in range(self._counter, self._counter + page_total)]
        self.increment(page_total)
        ans = []
        for url in urls:
            soup = get_page(url)
            # Sleep 1s between requests to avoid getting banned.
            time.sleep(1)
            proxy_list = soup.find('table', {"border": "2px"})
            for proxy in proxy_list.find_all('tr')[1:]:
                ip = proxy.find_all('td')[0].get_text()
                port = proxy.find_all('td')[1].get_text()
                ans.append(':'.join([ip, port]))
        return ans


class KuaidailiSpider(metaclass=SpiderMeta):
    start_url = 'http://www.kuaidaili.com/free/inha/{}/'

    def gets(self, page_total=2):
        urls = [self.start_url.format(i)
                for i in range(self._counter, self._counter + page_total)]
        self.increment(page_total)
        ans = []
        for url in urls:
            soup = get_page(url)
            time.sleep(1)
            proxy_list = soup.find('table',
                                   {'class': 'table table-bordered table-striped'}) \
                .find('tbody')
            for proxy in proxy_list.find_all('tr'):
                tmp = proxy.find_all('td')
                ip = tmp[0].get_text()
                port = tmp[1].get_text()
                ans.append(':'.join([ip, port]))
        return ans


class XiciSpider(metaclass=SpiderMeta):
    start_url = 'http://www.xicidaili.com/nn/{}'

    def gets(self, page_total=2):
        urls = [self.start_url.format(i)
                for i in range(self._counter, self._counter + page_total)]
        self.increment(page_total)
        ans = []
        for url in urls:
            soup = get_page(url)
            time.sleep(1)
            proxy_list = soup.find('table', {'id': 'ip_list'}) \
                .find_all('tr')[1:]
            for proxy in proxy_list:
                tmp = proxy.find_all('td')
                ip = tmp[1].get_text()
                port = tmp[2].get_text()
                ans.append(':'.join([ip, port]))
        return ans


from fastapi import FastAPI, Query
from urllib.parse import quote
import uvicorn

app = FastAPI()


@app.get("/search/")
def search_items(query_param: str = Query(..., description="Your query parameter")):
    # URL-encode the query parameter.
    encoded_query_param = quote(query_param)
    return {"result": f"Searching for {encoded_query_param}"}


if __name__ == '__main__':
    uvicorn.run('test_spider:app', host='0.0.0.0', port=8080)