add:org_news commit to kc

main
zhaoxiangpeng 4 weeks ago
parent 2e0c44e983
commit 25ffff7604

@ -0,0 +1,25 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class OrgNewsItem(scrapy.Item):
    """Base item for one scraped organization-news entry (list-page fields)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()       # news headline text
    pub_time = scrapy.Field()    # publication date/time string as scraped
    news_link = scrapy.Field()   # absolute URL of the news detail page
    updated_at = scrapy.Field()  # datetime when this item was scraped
class OrgNewsDatabaseItem(OrgNewsItem):
    """Detail-page item: extends OrgNewsItem with content, labels and source.

    Note: ``title`` is inherited from OrgNewsItem; the previous duplicate
    declaration here was redundant and has been removed.
    """
    news_label = scrapy.Field()    # category label shown next to the headline
    news_content = scrapy.Field()  # full article text
    tags = scrapy.Field()          # keyword tags added by the classify pipeline
    news_source = scrapy.Field()   # [org_name, second_org_name, news_module]
    spider_cls = scrapy.Field()    # name of the spider that produced the item

@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/21 17:29
# @Author : zhaoxiangpeng
# @File : main.py
from scrapy.cmdline import execute
# Launch the distributed org-news spider via Scrapy's CLI entry point.
# Swap the commented lines below to run one of the other spiders instead.
execute('scrapy crawl org_news_distributed'.split())
# execute('scrapy crawl org_news_lib_test'.split())
# execute('scrapy crawl org_news_sjtu_lib'.split())

@ -0,0 +1,100 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class OrgNewsSpiderMiddleware:
    """Default Scrapy spider-middleware template (pass-through, unmodified)."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    async def process_start(self, start):
        # Called with an async iterator over the spider start() method or the
        # matching method of an earlier spider middleware.
        async for item_or_request in start:
            yield item_or_request

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
class OrgNewsDownloaderMiddleware:
    """Default Scrapy downloader-middleware template (pass-through, unmodified)."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

@ -0,0 +1,67 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import re
import pymongo
from itemadapter import ItemAdapter
class OrgNewsPipeline:
    """Default no-op pipeline; items pass through unchanged."""

    def process_item(self, item, spider):
        return item
class NewsTitleClassifyPipeline:
    """Tag items by matching database-related keywords found in the title."""

    # Keyword groups used for coarse classification of news titles.
    __KEYWORDS__ = dict(
        Database=['开通', '试用', '停订', '新增', '时长'],
        HumanAffairs=['现在馆长', '馆长更换']
    )
    # Pre-compiled alternation over the Database keyword group.
    keyword_db_pattern = re.compile('|'.join(__KEYWORDS__['Database']))

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        news_title = adapter.get("title")
        # Guard: findall() raises TypeError when the title is missing/None;
        # fall back to an empty tag list instead of crashing the pipeline.
        tags1 = self.keyword_db_pattern.findall(news_title) if news_title else []
        item['tags'] = tags1
        return item
class NewsStandardPipeline:
    """Normalize article text by stripping all whitespace/newline characters."""

    content_standard_pattern = re.compile(r'[\r\n\s]')

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        news_content = adapter.get("news_content")
        # Guard: sub() raises TypeError when the content field is missing/None;
        # leave the item untouched in that case instead of crashing.
        if news_content:
            item['news_content'] = self.content_standard_pattern.sub('', news_content)
        return item
class MongoPipeline:
    """Persist each scraped item as one document in a MongoDB collection."""

    collection_name = "data_org_news"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection settings from the Scrapy project configuration.
        project_settings = crawler.settings
        uri = project_settings.get("MONGO_URI")
        database = project_settings.get("MONGO_DATABASE", "items")
        return cls(mongo_uri=uri, mongo_db=database)

    def open_spider(self, spider):
        # One client per spider run; released again in close_spider().
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        document = ItemAdapter(item).asdict()
        self.db[self.collection_name].insert_one(document)
        return item

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 9:23
# @Author : zhaoxiangpeng
# @File : create_table.py
import pymongo
from pymongo import MongoClient
from pymongo.database import Database
def mongo():
    """Return a handle to the ``science2`` MongoDB database.

    NOTE(review): connection credentials are hard-coded in plaintext here —
    move them to configuration/environment before wider distribution.
    """
    client = MongoClient("mongodb://root:123456@192.168.1.211:27017/")
    db = client['science2']
    return db
def create_news_collection(db: Database):
    """Create the secondary index for the org-news collection (idempotent)."""
    collection = db.get_collection('data_org_news')
    # collection.create_index(
    #     keys=[]
    # )
    collection.create_index(
        keys=[
            ('news_source', pymongo.ASCENDING),
        ],
        background=True,
        name='_news_s'
    )  # index on the source field; this field is a list, so it is a multikey index
if __name__ == '__main__':
    # One-off maintenance entry point: create the MongoDB indexes.
    mongo_db = mongo()
    create_news_collection(mongo_db)

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 9:23
# @Author : zhaoxiangpeng
# @File : export_data.py

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 11:06
# @Author : zhaoxiangpeng
# @File : task_push.py
import json
import redis
import org_news.settings as settings
from org_news.utils.read_cfg import read_cfg, format_cfg
def do_test():
    """Push the *test* selector configs onto the spider's Redis start queue."""
    r = redis.StrictRedis.from_url(settings.REDIS_URL)
    # Raw string: the original literal relied on invalid escape sequences
    # (e.g. "\s"), which emit SyntaxWarning on modern Python; the value is
    # unchanged.
    for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'):
        r.lpush('org_news_distributed:start_urls',
                json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':')))
def main():
    """Push the production selector configs onto the spider's Redis start queue."""
    r = redis.StrictRedis.from_url(settings.REDIS_URL)
    # Raw string: the original literal relied on invalid escape sequences
    # (SyntaxWarning on modern Python); the value is unchanged.
    for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'):
        r.lpush('org_news_distributed:start_urls',
                json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':')))


if __name__ == '__main__':
    main()

@ -0,0 +1,61 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/23 11:27
# @Author : zhaoxiangpeng
# @File : a_insert_xpath.py
import json
from pprint import pprint
class CfgTemplate:
    """Template for one site's selector configuration.

    Instances are filled attribute-by-attribute in main() and serialized via
    ``__dict__`` to one JSON line of the selector config files.
    """
    org_name = None            # organization name
    org_domain = None          # organization home-page URL
    second_org_name = None     # sub-organization name (e.g. library)
    second_org_domain = None   # sub-organization home-page URL
    news_module = None         # name of the news section being scraped
    news_module_link = None    # URL of the news list page
    list_s = None              # XPath selecting the list of news nodes
    title_s = None             # XPath for the title (relative to a list node)
    datetime_s = None          # XPath for the publication date/time
    news_link_s = None         # XPath for the detail-page link
    label_s = None             # XPath for the category label (optional)
    content_s = None           # XPath for the article body on the detail page
    spider_cls = None          # name of the spider that should consume this cfg
    invalid = None             # set to 1 to mark the config as disabled
def insert2testCfgFile(s_cfg: dict):
    """Overwrite the test selector-config file with a single JSON line."""
    pprint(s_cfg)
    line = json.dumps(s_cfg, ensure_ascii=False) + '\n'
    with open('library_selector_test.txt', 'w+', encoding='utf-8') as fh:
        fh.write(line)
def insert2cfgFile(s_cfg: dict):
    """Append one JSON line to the production selector-config file."""
    pprint(s_cfg)
    line = json.dumps(s_cfg, ensure_ascii=False) + '\n'
    with open('library_selector.txt', 'a+', encoding='utf-8') as fh:
        fh.write(line)
def main():
    """Fill a CfgTemplate for one site and write it to both config files."""
    module = CfgTemplate()
    # Plain attribute assignment instead of explicit module.__setattr__(...)
    # calls — identical effect, idiomatic Python.
    module.org_name = '西安交通大学'  # organization name
    module.org_domain = 'https://www.xjtu.edu.cn/'  # organization domain
    module.second_org_name = '图书馆'  # sub-organization name
    module.second_org_domain = 'http://www.lib.xjtu.edu.cn/'  # sub-organization domain
    module.news_module = '通知公告'  # name of the news section
    module.news_module_link = 'http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362'  # news list URL
    module.list_s = None  # news list selector
    module.title_s = None  # title selector
    module.news_link_s = None  # news link selector
    module.datetime_s = None  # publication date/time selector
    module.label_s = None  # label selector
    module.content_s = None  # body content selector
    module.spider_cls = 'org_news_lib_database'  # consuming spider name
    # module.invalid = 1  # mark config as disabled
    print(module)
    insert2testCfgFile(module.__dict__)
    insert2cfgFile(module.__dict__)


if __name__ == '__main__':
    main()

@ -0,0 +1,22 @@
{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "新闻资讯", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=263", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=264", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "通知公告", "news_module_link": "https://lib.tsinghua.edu.cn/tzgg.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.tsinghua.edu.cn/zydt.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "活动日历", "news_module_link": "https://lib.tsinghua.edu.cn/hdrl.htm", "list_s": "//div[@id=\"kzyc2\"]//div[@class=\"rl-box\"]/ul//li", "title_s": "./div[contains(@class, \"rl-title\")]/a/text()", "news_link_s": "./div[contains(@class, \"rl-title\")]/a/@href", "datetime_s": "./div[@class=\"rl-label\"]/span[@class=\"rl-date\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"library-content-content\"]/div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "新闻", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/261xwlb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]/div[@class=\"sub_0261 ul-inline\"]/ul/li", "title_s": "./a/div/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "公告", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/262gglb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "资源", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/gm/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "活动", "news_module_link": "https://www.lib.pku.edu.cn/4whjy/45hdzljz/453qb/index.htm", "list_s": "//div[@class=\"wrap_sub0435\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./div/div/div[@class=\"bottom\"]/a/text()", "news_link_s": "./div/div/div[@class=\"bottom\"]/a/@href", "datetime_s": "./div/div/div[@class=\"bottom\"]/div[@class=\"info\"]/p/i[contains(@class, \"icon-shijianfuxing\")]/../text()", "label_s": "./div/div/a[contains(@class, \"type\")]/text()", "content_s": "string(//div[@class=\"content\"]/div/div[@class=\"wrap_sub_0262\"])", "spider_cls": "org_news_lib_browser", "invalid": 1}
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "通知公告", "news_module_link": "https://libweb.zju.edu.cn/39478/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://libweb.zju.edu.cn/55543/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "本馆新闻", "news_module_link": "https://libweb.zju.edu.cn/55989/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "新闻通告", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=3&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "资源动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=4&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "融媒体动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=26&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": null, "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.nju.edu.cn/dzzy/zydt1.htm", "list_s": "//div[@class=\"zydt\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"time\"]/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "新闻通知", "news_module_link": "https://lib.nju.edu.cn/xw/xwtz.htm", "list_s": "//div[@class=\"gqzx-list\"]/ul/li", "title_s": "./a/text()", "news_link_s": "./a/@href", "datetime_s": "./span/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/资源动态/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p/text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "服务公告", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/服务公告/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "讲座培训", "news_module_link": "https://lib.ustc.edu.cn/category/讲座培训/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "资源信息", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493363", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1}
{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "通知公告", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1}

@ -0,0 +1,101 @@
# Scrapy settings for org_news project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "org_news"

SPIDER_MODULES = ["org_news.spiders"]
NEWSPIDER_MODULE = "org_news.spiders"

ADDONS = {}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "org_news (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
# Be polite to the target university sites: one request at a time per domain,
# with a one-second delay between requests.
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "org_news.middlewares.OrgNewsSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
RETRY_ENABLED = True
RETRY_TIMES = 2  # up to 2 retries per request (3 attempts total)
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # also retry some common client-error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
    # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "org_news.pipelines.OrgNewsPipeline": 300,
    "org_news.pipelines.NewsTitleClassifyPipeline": 400,
    "org_news.pipelines.NewsStandardPipeline": 410,
    # "org_news.pipelines.MongoPipeline": 500,
}

# NOTE(review): plaintext credentials committed below (Mongo and Redis) —
# move them to environment variables or an untracked local config.
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"

REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@ -0,0 +1,92 @@
import logging
import json
from datetime import datetime
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str
from org_news.items import OrgNewsDatabaseItem
class OrgNewsDistributedSpider(RedisSpider):
    """Distributed news spider.

    Pulls JSON selector configs from a Redis start queue (pushed by
    task_push.py) and crawls the library news list/detail pages the config
    describes.
    """

    name = "org_news_distributed"
    custom_settings = dict(
        # Use the scrapy-redis scheduler so the request queue is shared.
        SCHEDULER="scrapy_redis.scheduler.Scheduler",
        # Redis-backed request-fingerprint de-duplication.
        DUPEFILTER_CLASS="scrapy_redis.dupefilter.RFPDupeFilter",
        # Keep queue/dupefilter state in Redis between runs.
        SCHEDULER_PERSIST=True,
        # REDIS_HOST='192.168.1.211',
        # REDIS_PORT=6379,
        # # database password
        # REDIS_PARAMS={
        #     'password': 'kcidea1509',
        # },
    )

    def make_request_from_data(self, data):
        """Turn one Redis queue entry (JSON selector config) into a request.

        Entries whose ``spider_cls`` is not 'org_news_lib_database' are
        skipped with a warning.
        """
        formatted_data = bytes_to_str(data, self.redis_encoding)
        cfg = json.loads(formatted_data)
        if cfg['spider_cls'] != 'org_news_lib_database':
            self.logger.warning('不需要 %s 消费,跳过\n配置为:%s' % (self.name, cfg))
            # Bare return: ends this generator without emitting a request.
            # (The original `return []` value inside a generator was ignored.)
            return
        list_selector = cfg.pop('list_selector')
        detail_selector = cfg.pop('detail_selector')
        yield scrapy.Request(url=cfg['news_module_link'], dont_filter=True,
                             meta=dict(s_cfg=cfg, list_selector=list_selector,
                                       detail_selector=detail_selector),
                             callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse a news list page and follow each entry's detail link."""
        req_meta = response.meta
        s_cfg = req_meta['s_cfg']
        current_module = '-'.join([s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']])
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        if not nodes:
            self.logger.warning("""
            当前采集模块%s
            没有采集到新闻链接
            资源页链接%s""" % (current_module, req_meta['s_cfg']['news_module_link']))
        else:
            # Bug fix: the success branch previously logged the copy-pasted
            # failure text ("没有采集到新闻链接") even though links WERE found.
            self.logger.info("""
            当前采集模块%s
            采集到新闻链接 %s 条
            资源页链接%s""" % (current_module, len(nodes), req_meta['s_cfg']['news_module_link']))
        for node in nodes:
            # Optional selectors (datetime/link/label) may be None in the cfg;
            # `sel and ...` short-circuits to None in that case.
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Parse a news detail page and yield the assembled item."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item

@ -0,0 +1,66 @@
import logging
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem
class OrgNewsFudanLibSpider(scrapy.Spider):
    """Spider for Fudan University library news via the site's generalQuery API."""

    name = "org_news_fudan_lib"
    allowed_domains = ["www.fudan.edu.cn"]
    start_urls = ["https://www.fudan.edu.cn/"]
    cfgs = [
        dict(org_name='复旦大学',
             org_domain='https://www.fudan.edu.cn/',
             second_org_name='图书馆',
             second_org_domain='https://library.fudan.edu.cn/',
             news_module='资源动态',
             news_module_link='https://library.fudan.edu.cn/zydtx/list.htm')
    ]

    def start_requests(self):
        # Bug fix: FormRequest formdata values must be strings — the original
        # passed ints (siteId=928, rows=14, ...), which makes Scrapy's form
        # encoding raise TypeError before the request is ever sent.
        # NOTE(review): no callback is set, so the response goes to the default
        # parse(), which this spider does not define — confirm the intended
        # flow before enabling this spider.
        yield scrapy.FormRequest(
            url="https://library.fudan.edu.cn/_wp3services/generalQuery?queryObj=articles",
            formdata=dict(
                siteId="928", rows="14", columnId="42893", pageIndex="1",
                returnInfos='[{"field":"title","name":"title"},{"field":"modifyTime","pattern":[{"name":"d","value":"yyyy-MM-dd"}],"name":"modifyTime"}]',
                conditions='[]', orders='[{"field":"modifyTime","type":"desc"}]'
            ))

    def parse_news_list(self, response):
        """Parse a news list page and follow each entry's detail link.

        NOTE(review): expects 'list_selector', 's_cfg' and 'detail_selector'
        in response.meta, but start_requests() never sets them — this spider
        looks unfinished; confirm before use.
        """
        req_meta = response.meta
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        for node in nodes:
            # Optional selectors may be None; `sel and ...` short-circuits.
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Parse a news detail page and yield the assembled item."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item

@ -0,0 +1,10 @@
import scrapy
class OrgNewsHustLibSpider(scrapy.Spider):
    """Placeholder spider for HUST library news — parse() not implemented yet."""

    name = "org_news_hust_lib"
    allowed_domains = ["www.hust.edu.cn"]
    start_urls = ["https://www.hust.edu.cn/"]

    def parse(self, response):
        # TODO: implement list/detail parsing for this site.
        pass

@ -0,0 +1,10 @@
import scrapy
class OrgNewsLibBrowserSpider(scrapy.Spider):
    """Placeholder spider for browser-rendered library sites — not implemented.

    NOTE(review): allowed_domains/start_urls look like template values
    ("lib.edu.cn") — confirm before enabling.
    """

    name = "org_news_lib_browser"
    allowed_domains = ["lib.edu.cn"]
    start_urls = ["https://lib.edu.cn"]

    def parse(self, response):
        # TODO: implement parsing (likely requires a rendering backend).
        pass

@ -0,0 +1,76 @@
import logging
import json
from datetime import datetime
import scrapy
from org_news.items import OrgNewsDatabaseItem
from org_news.utils.read_cfg import read_cfg, format_cfg
class OrgNewsLibDatabaseSpider(scrapy.Spider):
    """Config-driven library-news spider.

    Reads one JSON config per line from the selector file, keeps the entries
    whose ``spider_cls`` matches this spider, then scrapes each configured
    news module: list page -> per-entry detail page -> OrgNewsDatabaseItem.
    """
    name = "org_news_lib_database"
    # allowed_domains = ["lib.edu.cn"]
    # start_urls = ["https://lib.edu.cn"]

    def start_requests(self):
        """Yield one list-page request per config entry owned by this spider."""
        # Raw string: the Windows path contains backslashes that would
        # otherwise be read as (invalid) escape sequences.
        # NOTE(review): hard-coded absolute path — consider moving to settings.
        for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'):
            if cfg['spider_cls'] != self.name:
                continue
            cfg = format_cfg(cfg)
            list_selector = cfg.pop('list_selector')
            detail_selector = cfg.pop('detail_selector')
            yield scrapy.Request(url=cfg['news_module_link'],
                                 meta=dict(s_cfg=cfg, list_selector=list_selector,
                                           detail_selector=detail_selector),
                                 callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse a news list page.

        Extracts title / publish time / link / optional label with the
        configured XPath selectors and follows each link to the detail page.
        """
        req_meta = response.meta
        # BUG FIX: the meta key set in start_requests is 's_cfg', not 'cfg';
        # req_meta['cfg'] raised KeyError on every response.
        s_cfg = req_meta['s_cfg']
        current_module = '-'.join([s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']])
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        if not nodes:
            self.logger.warning("""
当前采集模块%s
没有采集到新闻链接
资源页链接%s""" % (current_module, s_cfg['news_module_link']))
        else:
            # FIX: the success-path message wrongly reused the failure text
            # "没有采集到新闻链接"; it now reports the count actually collected.
            self.logger.info("""
当前采集模块%s
采集到新闻链接 %s
资源页链接%s""" % (current_module, len(nodes), s_cfg['news_module_link']))
        for node in nodes:
            # Selectors may be empty/None in the config; the `and` guards
            # skip extraction in that case instead of raising.
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=s_cfg,
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Parse a detail page and yield an OrgNewsDatabaseItem."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item

@ -0,0 +1,19 @@
import scrapy
from org_news.utils.read_cfg import read_cfg
from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider
class OrgNewsLibTestSpider(OrgNewsLibDatabaseSpider):
    """Test variant of OrgNewsLibDatabaseSpider.

    Reads the *test* selector config, does not filter entries by
    ``spider_cls``, and regroups the flat selector keys inline instead of
    going through format_cfg.
    """
    name = "org_news_lib_test"

    def start_requests(self):
        """Yield one list-page request per test config entry."""
        # Raw string: backslashes in the Windows path would otherwise be
        # read as (invalid) escape sequences.
        for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'):
            # if cfg['spider_cls'] != self.name: continue
            list_selector = dict(list_s=cfg.pop('list_s'), label_s=cfg.pop('label_s', None),
                                 title_s=cfg.pop('title_s'),
                                 datetime_s=cfg.pop('datetime_s'), news_link_s=cfg.pop('news_link_s'))
            detail_selector = dict(content_s=cfg.pop('content_s'))
            yield scrapy.Request(url=cfg['news_module_link'],
                                 meta=dict(s_cfg=cfg, list_selector=list_selector,
                                           detail_selector=detail_selector),
                                 callback=self.parse_news_list)

@ -0,0 +1,39 @@
import logging
import scrapy
from scrapy_redis.spiders import RedisSpider
from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider
from org_news.utils.read_cfg import read_cfg
from org_news.utils import tools
class OrgNewsSjtuLibSpider(OrgNewsLibDatabaseSpider):
    """SJTU library spider.

    Overrides list parsing to rewrite each entry's link onto the site's
    content.shtml endpoint; detail parsing is inherited from the parent.
    """
    name = "org_news_sjtu_lib"
    # allowed_domains = ["www.lib.sjtu.edu.cn"]
    # start_urls = ["https://www.lib.sjtu.edu.cn/"]

    def parse_news_list(self, response):
        """Parse the news list page and follow each entry's detail endpoint."""
        meta = response.meta
        selectors = meta.get('list_selector')
        label_selector = selectors.get('label_s', None)
        for entry in response.xpath(selectors['list_s']):
            entry_data = dict(
                title=entry.xpath(selectors['title_s']).get(),
                pub_time=selectors['datetime_s'] and entry.xpath(selectors['datetime_s']).get(),
                pub_link=selectors['news_link_s'] and response.urljoin(
                    entry.xpath(selectors['news_link_s']).get()),
                news_label=label_selector and entry.xpath(label_selector).get()
            )
            if not entry_data['pub_link']:
                self.log('没有找到link: %s' % entry_data['title'], level=logging.WARNING)
                continue
            # The list link only carries an id; the readable article lives
            # behind the content.shtml endpoint, so rebuild the URL from it.
            query = tools.url_parse(entry_data['pub_link'])
            yield response.follow('https://www.lib.sjtu.edu.cn/f/content/content.shtml?id=%s' % query.get('id'),
                                  meta=dict(list_data=entry_data, s_cfg=meta['s_cfg'],
                                            detail_selector=meta['detail_selector']),
                                  callback=self.parse_news_detail)

@ -0,0 +1,10 @@
import scrapy
class OrgNewsWhuLibSpider(scrapy.Spider):
name = "org_news_whu_lib"
allowed_domains = ["www.whu.edu.cn"]
start_urls = ["https://www.whu.edu.cn/"]
def parse(self, response):
pass

@ -0,0 +1,113 @@
import logging
import json
from datetime import datetime
from urllib.parse import urlparse
import scrapy
from parsel.selector import Selector
from org_news.items import OrgNewsDatabaseItem
from org_news.utils.read_cfg import read_cfg, format_cfg
from org_news.utils import tools
# Numeric string slots ('0'..'6') under which the CMS API nests each field object.
FIELD_KEYS = [str(i) for i in range(7)]
# Engine instance id required by the lib.xjtu.edu.cn news API.
engineInstanceId = '361785'


def find_value_by_key(key: str, obj: dict):
    """Return the 'value' of the field dict in *obj* whose 'key' equals *key*.

    Scans the numeric slots listed in FIELD_KEYS and returns None when no
    slot matches.

    Fix: use dict.get instead of obj[idx] so records with fewer than seven
    slots no longer raise KeyError.
    """
    for idx in FIELD_KEYS:
        field = obj.get(idx)
        if isinstance(field, dict) and field.get('key') == key:
            return field.get('value')
    return None
class OrgNewsXjtuLibSpider(scrapy.Spider):
    """XJTU library news spider driven by the site's JSON engine API.

    POSTs the list API once per config entry, then fetches each item's
    detail API and extracts the article HTML embedded as JSON in the page's
    last <script> tag.
    """
    name = "org_news_xjtu_lib"
    allowed_domains = ["www.lib.xjtu.edu.cn"]
    request_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{module_id}/type/more-datas'
    detail_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{news_id}/detail'
    start_urls = ["https://www.xjtu.edu.cn/"]

    def start_requests(self):
        """POST the list API once per config entry owned by this spider."""
        # Raw string: backslashes in the Windows path would otherwise be
        # read as (invalid) escape sequences.
        for cfg in read_cfg(
                r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'):
            if cfg['spider_cls'] != self.name:
                continue
            cfg = format_cfg(cfg)
            list_selector = cfg.pop('list_selector')
            detail_selector = cfg.pop('detail_selector')
            # The module id is the second path segment of the configured link.
            path = urlparse(cfg['news_module_link']).path
            module_id = path.split('/')[2]
            params = tools.url_parse(cfg['news_module_link'])
            yield scrapy.FormRequest(self.request_api.format(module_id=module_id), method="POST", dont_filter=True,
                                     formdata=dict(engineInstanceId=engineInstanceId, pageNum='1', pageSize='20',
                                                   typeId=params.get('typeId'), topTypeId='', sw=''),
                                     meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector,
                                               websiteId=params.get('websiteId')),
                                     callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse the JSON list response and request each item's detail API."""
        req_meta = response.meta
        websiteId = req_meta.get('websiteId')
        if response.status != 200:
            self.log('响应状态码异常', level=logging.WARNING)
            return
        resp_json = json.loads(response.text)
        # The API signals success with status == 200 and this fixed message.
        if resp_json.get('status') != 200 or resp_json.get('message') != "请求正确响应":
            self.log('响应内容异常', level=logging.WARNING)
            return
        # Robustness: fall back to an empty list when the nested payload is absent.
        nodes = resp_json.get('data', {}).get('datas', {}).get('datas') or []
        for node in nodes:
            list_data = dict(
                title=find_value_by_key('标题', node),
                pub_time=find_value_by_key('时间', node),
                pub_link=node.get('url') and response.urljoin(node.get('url')),
                news_label=None
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            # Rebuild the link onto the detail API, carrying the engine params.
            payload = f'engineInstanceId={node.get("engineInstanceId", engineInstanceId)}&typeId={node.get("typeId")}&pageId=1&websiteId={websiteId}&currentBranch=0'
            list_data['pub_link'] = pub_link = self.detail_api.format(news_id=node.get('id')) + '?' + payload
            # self.logger.debug('publink: %s' % pub_link)
            yield scrapy.Request(url=pub_link,
                                 meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                           detail_selector=req_meta['detail_selector']),
                                 callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article body from the JSON blob embedded in the page's
        last <script> tag and yield an OrgNewsDatabaseItem."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        list_data = req_meta.get('list_data', {})
        contents = None
        # NOTE(review): '/html/script' only matches <script> directly under
        # <html>; confirm the target pages really place it there.
        last_script = response.xpath('/html/script[last()]/text()')
        if last_script:
            # Equivalent to:
            # re.findall(r'data: (\{"engineInstanceId".*?\}),\r\n', last_script, re.S | re.M)
            data_text = last_script.re_first(r'data: (\{"engineInstanceId".*?\}),\r\n')
            try:
                # FIX: re_first returns None on a miss and json.loads(None)
                # raises TypeError, which the original did not catch.
                data_dic = json.loads(data_text)
                selector = Selector(data_dic.get('content', ''), type='html')
            except (TypeError, json.decoder.JSONDecodeError):
                contents = None
            else:
                contents = selector.xpath('string(.)').get(None)
        if not contents:
            self.logger.warning("没有提取到数据")
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = contents
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item

@ -0,0 +1,27 @@
import scrapy
from org_news.items import OrgNewsItem
class SeuLibResourceDynamicsSpider(scrapy.Spider):
    """SEU library 'resource dynamics' spider (prototype: detail pages are
    only printed, no item is yielded yet)."""
    name = "seu_lib_resource_dynamics"
    allowed_domains = ["lib.seu.edu.cn"]
    start_urls = ["https://lib.seu.edu.cn/list.php?fid=264&page=1"]

    def parse(self, response):
        """Walk the list items and follow each entry's detail link."""
        for entry in response.xpath('//div[@class="content-right-list"]/ul/li[@class="list-item"]'):
            entry_data = dict(
                title=entry.xpath('./a/span/text()').get(),
                pub_time=entry.xpath('./span[@class="item-time"]/text()').get(),
                pub_link=response.urljoin(entry.xpath('./a/@href').get()),
            )
            yield response.follow(entry_data['pub_link'], callback=self.parse_news_detail,
                                  meta=dict(list_data=entry_data))

    def parse_news_detail(self, response):
        """Extract the article text and print it for inspection."""
        listing = response.meta.get('list_data', {})
        fragments = response.xpath('string(//div[@class="article-wrap"])').getall()
        text = '\n'.join(fragment.strip() for fragment in fragments)
        print(text)

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 14:42
# @Author : zhaoxiangpeng
# @File : __init__.py.py

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/23 15:02
# @Author : zhaoxiangpeng
# @File : read_cfg.py
import json
def read_cfg(file):
    """Yield one config dict per line of *file* (JSON-lines format).

    :param file: path to a UTF-8 text file with one JSON object per line
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON

    Robustness fix: whitespace-only lines are skipped, so a stray blank line
    or trailing newline no longer aborts the whole generator.
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                yield json.loads(line)
def format_cfg(cfg: dict):
    """Regroup a flat selector config into nested selector dicts.

    Moves the list-page keys into ``new_cfg['list_selector']`` and the
    detail-page key into ``new_cfg['detail_selector']``, leaving the other
    entries untouched.

    :param cfg: flat config dict as produced by read_cfg
    :return: a new dict; *cfg* itself is left unmodified

    Fix: the original popped keys from *cfg* before copying, mutating the
    caller's dict; we now copy first and pop from the copy.
    """
    new_cfg = cfg.copy()
    new_cfg['list_selector'] = dict(list_s=new_cfg.pop('list_s'), label_s=new_cfg.pop('label_s', None),
                                    title_s=new_cfg.pop('title_s'),
                                    datetime_s=new_cfg.pop('datetime_s'), news_link_s=new_cfg.pop('news_link_s'))
    new_cfg['detail_selector'] = dict(content_s=new_cfg.pop('content_s'))
    return new_cfg
def test_read_cfg():
    """Manual smoke test: print each parsed config and its regrouped form."""
    # NOTE(review): hard-coded absolute path — only works on the dev machine.
    for cfg in read_cfg('/scrapy-demo1/org_news/org_news/selector_cfg/library_selector.txt'):
        print(cfg)
        print(format_cfg(cfg))
if __name__ == '__main__':
    test_read_cfg()

@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 14:42
# @Author : zhaoxiangpeng
# @File : tools.py
from urllib.parse import urlparse, parse_qs
def url_parse(url: str):
    """Parse *url*'s query string into a dict.

    Single-valued parameters map to their scalar string value; repeated
    parameters map to the list of values (parse_qs semantics otherwise
    preserved).

    :param url: full URL whose query component is parsed
    :return: dict of query parameter name -> value or list of values
    """
    query = urlparse(url).query
    params = parse_qs(query)
    # len(vals) instead of the dunder __len__() call; iterate items() directly.
    return {key: vals[0] if len(vals) == 1 else vals for key, vals in params.items()}

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# @Time : 2025/7/25 13:39
# @Author : zhaoxiangpeng
# @File : run.py
import os
import re
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings
# Spider module names this runner cares about look like "org_news_<school>_lib".
regex = re.compile(r'^(org_news_).*?(_lib)$')


def load_spider_script(path):
    """Return the spider module names in *path* matching org_news_*_lib.

    :param path: directory to scan for spider scripts
    :return: list of matching module names (file stems, without '.py')
    """
    spiders = []
    for script in os.listdir(path):
        if not script.endswith('.py'):
            continue
        spider_name = script[:-3]
        # A match object is already truthy; the bool() wrapper was redundant.
        if regex.search(spider_name):
            spiders.append(spider_name)
    return spiders
# Build a crawler process from the project settings and run the XJTU spider.
# NOTE(review): load_spider_script above is defined but never used here.
process = CrawlerProcess(get_project_settings())
# process.crawl('org_news_sjtu_lib')
process.crawl('org_news_xjtu_lib')
process.start()

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = org_news.settings
[deploy]
#url = http://localhost:6800/
project = org_news
Loading…
Cancel
Save