From 25ffff7604afdc6a0d0427af9b3eddf92a3bca03 Mon Sep 17 00:00:00 2001 From: zhaoxiangpeng <1943364377@qq.com> Date: Tue, 5 Aug 2025 15:39:23 +0800 Subject: [PATCH] add:org_news commit to kc --- org_news/org_news/__init__.py | 0 org_news/org_news/items.py | 25 ++++ org_news/org_news/main.py | 11 ++ org_news/org_news/middlewares.py | 100 ++++++++++++++++ org_news/org_news/pipelines.py | 67 +++++++++++ org_news/org_news/scripts/create_table.py | 33 +++++ org_news/org_news/scripts/export_data.py | 4 + org_news/org_news/scripts/task_push.py | 26 ++++ .../org_news/selector_cfg/a_insert_xpath.py | 61 ++++++++++ .../selector_cfg/library_selector.txt | 22 ++++ org_news/org_news/settings.py | 101 ++++++++++++++++ org_news/org_news/spiders/__init__.py | 4 + .../org_news/spiders/org_news_distributed.py | 92 ++++++++++++++ .../org_news/spiders/org_news_fudan_lib.py | 66 ++++++++++ .../org_news/spiders/org_news_hust_lib.py | 10 ++ .../org_news/spiders/org_news_lib_browser.py | 10 ++ .../org_news/spiders/org_news_lib_database.py | 76 ++++++++++++ .../org_news/spiders/org_news_lib_test.py | 19 +++ .../org_news/spiders/org_news_sjtu_lib.py | 39 ++++++ org_news/org_news/spiders/org_news_whu_lib.py | 10 ++ .../org_news/spiders/org_news_xjtu_lib.py | 113 ++++++++++++++++++ .../spiders/seu_lib_resource_dynamics.py | 27 +++++ org_news/org_news/utils/__init__.py | 4 + org_news/org_news/utils/read_cfg.py | 32 +++++ org_news/org_news/utils/tools.py | 18 +++ org_news/run.py | 32 +++++ org_news/scrapy.cfg | 11 ++ 27 files changed, 1013 insertions(+) create mode 100644 org_news/org_news/__init__.py create mode 100644 org_news/org_news/items.py create mode 100644 org_news/org_news/main.py create mode 100644 org_news/org_news/middlewares.py create mode 100644 org_news/org_news/pipelines.py create mode 100644 org_news/org_news/scripts/create_table.py create mode 100644 org_news/org_news/scripts/export_data.py create mode 100644 org_news/org_news/scripts/task_push.py create mode 100644 
org_news/org_news/selector_cfg/a_insert_xpath.py create mode 100644 org_news/org_news/selector_cfg/library_selector.txt create mode 100644 org_news/org_news/settings.py create mode 100644 org_news/org_news/spiders/__init__.py create mode 100644 org_news/org_news/spiders/org_news_distributed.py create mode 100644 org_news/org_news/spiders/org_news_fudan_lib.py create mode 100644 org_news/org_news/spiders/org_news_hust_lib.py create mode 100644 org_news/org_news/spiders/org_news_lib_browser.py create mode 100644 org_news/org_news/spiders/org_news_lib_database.py create mode 100644 org_news/org_news/spiders/org_news_lib_test.py create mode 100644 org_news/org_news/spiders/org_news_sjtu_lib.py create mode 100644 org_news/org_news/spiders/org_news_whu_lib.py create mode 100644 org_news/org_news/spiders/org_news_xjtu_lib.py create mode 100644 org_news/org_news/spiders/seu_lib_resource_dynamics.py create mode 100644 org_news/org_news/utils/__init__.py create mode 100644 org_news/org_news/utils/read_cfg.py create mode 100644 org_news/org_news/utils/tools.py create mode 100644 org_news/run.py create mode 100644 org_news/scrapy.cfg diff --git a/org_news/org_news/__init__.py b/org_news/org_news/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/org_news/org_news/items.py b/org_news/org_news/items.py new file mode 100644 index 0000000..4aeed24 --- /dev/null +++ b/org_news/org_news/items.py @@ -0,0 +1,25 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class OrgNewsItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + title = scrapy.Field() + pub_time = scrapy.Field() + news_link = scrapy.Field() + + updated_at = scrapy.Field() + + +class OrgNewsDatabaseItem(OrgNewsItem): + title = scrapy.Field() + news_label = scrapy.Field() + news_content = scrapy.Field() + tags = scrapy.Field() + news_source = scrapy.Field() + 
spider_cls = scrapy.Field() diff --git a/org_news/org_news/main.py b/org_news/org_news/main.py new file mode 100644 index 0000000..52beec7 --- /dev/null +++ b/org_news/org_news/main.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/21 17:29 +# @Author : zhaoxiangpeng +# @File : main.py + +from scrapy.cmdline import execute + + +execute('scrapy crawl org_news_distributed'.split()) +# execute('scrapy crawl org_news_lib_test'.split()) +# execute('scrapy crawl org_news_sjtu_lib'.split()) diff --git a/org_news/org_news/middlewares.py b/org_news/org_news/middlewares.py new file mode 100644 index 0000000..90c6786 --- /dev/null +++ b/org_news/org_news/middlewares.py @@ -0,0 +1,100 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class OrgNewsSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. 
+ for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + async def process_start(self, start): + # Called with an async iterator over the spider start() method or the + # maching method of an earlier spider middleware. + async for item_or_request in start: + yield item_or_request + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class OrgNewsDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/org_news/org_news/pipelines.py b/org_news/org_news/pipelines.py new file mode 100644 index 0000000..55b7313 --- /dev/null +++ b/org_news/org_news/pipelines.py @@ -0,0 +1,67 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface + +import re +import pymongo +from itemadapter import ItemAdapter + + +class OrgNewsPipeline: + def process_item(self, item, spider): + return item + + +class NewsTitleClassifyPipeline: + __KEYWORDS__ = dict( + Database=['开通', '试用', '停订', '新增', '时长'], + HumanAffairs=['现在馆长', '馆长更换'] + ) + keyword_db_pattern = re.compile('|'.join(__KEYWORDS__['Database'])) + + def process_item(self, item, spider): + adapter = ItemAdapter(item) + news_title = adapter.get("title") + tags1 = self.keyword_db_pattern.findall(news_title) + item['tags'] = tags1 + return item + + +class NewsStandardPipeline: + content_standard_pattern = re.compile(r'[\r\n\s]') + + def process_item(self, item, spider): + adapter = ItemAdapter(item) + news_content = adapter.get("news_content") + item['news_content'] = self.content_standard_pattern.sub('', news_content) + return item + + +class MongoPipeline: + collection_name = "data_org_news" + + def __init__(self, mongo_uri, mongo_db): + self.mongo_uri = mongo_uri + self.mongo_db = mongo_db + + @classmethod + def from_crawler(cls, crawler): + return cls( + mongo_uri=crawler.settings.get("MONGO_URI"), + mongo_db=crawler.settings.get("MONGO_DATABASE", "items"), + ) + + def open_spider(self, spider): + self.client = 
pymongo.MongoClient(self.mongo_uri) + self.db = self.client[self.mongo_db] + + def close_spider(self, spider): + self.client.close() + + def process_item(self, item, spider): + self.db[self.collection_name].insert_one(ItemAdapter(item).asdict()) + return item diff --git a/org_news/org_news/scripts/create_table.py b/org_news/org_news/scripts/create_table.py new file mode 100644 index 0000000..e6a0772 --- /dev/null +++ b/org_news/org_news/scripts/create_table.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 9:23 +# @Author : zhaoxiangpeng +# @File : create_table.py + +import pymongo +from pymongo import MongoClient +from pymongo.database import Database + + +def mongo(): + client = MongoClient("mongodb://root:123456@192.168.1.211:27017/") + db = client['science2'] + return db + + +def create_news_collection(db: Database): + collection = db.get_collection('data_org_news') + # collection.create_index( + # keys=[] + # ) + collection.create_index( + keys=[ + ('news_source', pymongo.ASCENDING), + ], + background=True, + name='_news_s' + ) # 创建来源索引,这个字段是list + + +if __name__ == '__main__': + mongo_db = mongo() + create_news_collection(mongo_db) diff --git a/org_news/org_news/scripts/export_data.py b/org_news/org_news/scripts/export_data.py new file mode 100644 index 0000000..0ffa96f --- /dev/null +++ b/org_news/org_news/scripts/export_data.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 9:23 +# @Author : zhaoxiangpeng +# @File : export_data.py diff --git a/org_news/org_news/scripts/task_push.py b/org_news/org_news/scripts/task_push.py new file mode 100644 index 0000000..ddd477b --- /dev/null +++ b/org_news/org_news/scripts/task_push.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 11:06 +# @Author : zhaoxiangpeng +# @File : task_push.py + +import json +import redis + +import org_news.settings as settings +from org_news.utils.read_cfg import read_cfg, format_cfg + + +def do_test(): + r = 
redis.StrictRedis.from_url(settings.REDIS_URL) + for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'): + r.lpush('org_news_distributed:start_urls', json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':'))) + + +def main(): + r = redis.StrictRedis.from_url(settings.REDIS_URL) + for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'): + r.lpush('org_news_distributed:start_urls', json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':'))) + + +if __name__ == '__main__': + main() diff --git a/org_news/org_news/selector_cfg/a_insert_xpath.py b/org_news/org_news/selector_cfg/a_insert_xpath.py new file mode 100644 index 0000000..3ae3ddc --- /dev/null +++ b/org_news/org_news/selector_cfg/a_insert_xpath.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/23 11:27 +# @Author : zhaoxiangpeng +# @File : a_insert_xpath.py + +import json +from pprint import pprint + + +class CfgTemplate: + org_name = None + org_domain = None + second_org_name = None + second_org_domain = None + news_module = None + news_module_link = None + list_s = None + title_s = None + datetime_s = None + news_link_s = None + label_s = None + content_s = None + spider_cls = None + invalid = None + + +def insert2testCfgFile(s_cfg: dict): + pprint(s_cfg) + with open('library_selector_test.txt', 'w+', encoding='utf-8') as f: + f.write(json.dumps(s_cfg, ensure_ascii=False) + '\n') + + +def insert2cfgFile(s_cfg: dict): + pprint(s_cfg) + with open('library_selector.txt', 'a+', encoding='utf-8') as f: + f.write(json.dumps(s_cfg, ensure_ascii=False) + '\n') + + +def main(): + module = CfgTemplate() + module.__setattr__('org_name', '西安交通大学') # 机构名称 + module.__setattr__('org_domain', 'https://www.xjtu.edu.cn/') # 机构域名 + module.__setattr__('second_org_name', '图书馆') # 二级机构名称 + module.__setattr__('second_org_domain', 'http://www.lib.xjtu.edu.cn/') # 二级机构域名 + 
module.__setattr__('news_module', '通知公告') # 新闻所在模块名称 + module.__setattr__('news_module_link', 'http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362') # 新闻所在模块链接 + module.__setattr__('list_s', None) # 新闻列表选择器 + module.__setattr__('title_s', None) # 新闻标题选择器 + module.__setattr__('news_link_s', None) # 新闻链接选择器 + module.__setattr__('datetime_s', None) # 发布日期时间选择器 + module.__setattr__('label_s', None) # 新闻标签选择器 + module.__setattr__('content_s', None) # 正文内容选择器 + module.__setattr__('spider_cls', 'org_news_lib_database') # 爬虫类 + # module.__setattr__('invalid', 1) # 爬虫类 + print(module) + insert2testCfgFile(module.__dict__) + insert2cfgFile(module.__dict__) + + +if __name__ == '__main__': + main() diff --git a/org_news/org_news/selector_cfg/library_selector.txt b/org_news/org_news/selector_cfg/library_selector.txt new file mode 100644 index 0000000..8a45897 --- /dev/null +++ b/org_news/org_news/selector_cfg/library_selector.txt @@ -0,0 +1,22 @@ +{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "新闻资讯", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=263", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=264", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": 
"string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "通知公告", "news_module_link": "https://lib.tsinghua.edu.cn/tzgg.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.tsinghua.edu.cn/zydt.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "活动日历", "news_module_link": "https://lib.tsinghua.edu.cn/hdrl.htm", "list_s": "//div[@id=\"kzyc2\"]//div[@class=\"rl-box\"]/ul//li", "title_s": "./div[contains(@class, \"rl-title\")]/a/text()", "news_link_s": "./div[contains(@class, \"rl-title\")]/a/@href", "datetime_s": 
"./div[@class=\"rl-label\"]/span[@class=\"rl-date\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"library-content-content\"]/div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "新闻", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/261xwlb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]/div[@class=\"sub_0261 ul-inline\"]/ul/li", "title_s": "./a/div/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "公告", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/262gglb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "资源", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/gm/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": 
"string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "活动", "news_module_link": "https://www.lib.pku.edu.cn/4whjy/45hdzljz/453qb/index.htm", "list_s": "//div[@class=\"wrap_sub0435\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./div/div/div[@class=\"bottom\"]/a/text()", "news_link_s": "./div/div/div[@class=\"bottom\"]/a/@href", "datetime_s": "./div/div/div[@class=\"bottom\"]/div[@class=\"info\"]/p/i[contains(@class, \"icon-shijianfuxing\")]/../text()", "label_s": "./div/div/a[contains(@class, \"type\")]/text()", "content_s": "string(//div[@class=\"content\"]/div/div[@class=\"wrap_sub_0262\"])", "spider_cls": "org_news_lib_browser", "invalid": 1} +{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "通知公告", "news_module_link": "https://libweb.zju.edu.cn/39478/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://libweb.zju.edu.cn/55543/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", 
"second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "本馆新闻", "news_module_link": "https://libweb.zju.edu.cn/55989/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "新闻通告", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=3&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"} +{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "资源动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=4&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"} +{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": 
"融媒体动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=26&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": null, "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"} +{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.nju.edu.cn/dzzy/zydt1.htm", "list_s": "//div[@class=\"zydt\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"time\"]/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "新闻通知", "news_module_link": "https://lib.nju.edu.cn/xw/xwtz.htm", "list_s": "//div[@class=\"gqzx-list\"]/ul/li", "title_s": "./a/text()", "news_link_s": "./a/@href", "datetime_s": "./span/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/资源动态/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p/text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": 
"图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "服务公告", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/服务公告/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "讲座培训", "news_module_link": "https://lib.ustc.edu.cn/category/讲座培训/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "资源信息", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493363", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1} +{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "通知公告", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1} diff --git a/org_news/org_news/settings.py b/org_news/org_news/settings.py new file mode 100644 index 
0000000..5c2885c --- /dev/null +++ b/org_news/org_news/settings.py @@ -0,0 +1,101 @@ +# Scrapy settings for org_news project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "org_news" + +SPIDER_MODULES = ["org_news.spiders"] +NEWSPIDER_MODULE = "org_news.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "org_news (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Concurrency and throttling settings +#CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 1 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +DEFAULT_REQUEST_HEADERS = { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36" +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "org_news.middlewares.OrgNewsSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html + +RETRY_ENABLED = True +RETRY_TIMES = 2 # 重试3次 +RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404] # 增加了一些常见的错误码 +DOWNLOADER_MIDDLEWARES = { + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550 + # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543, +} + +# Enable or disable 
extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "org_news.pipelines.OrgNewsPipeline": 300, + "org_news.pipelines.NewsTitleClassifyPipeline": 400, + "org_news.pipelines.NewsStandardPipeline": 410, + # "org_news.pipelines.MongoPipeline": 500, +} + +MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/" +MONGO_DATABASE = "science2" + +REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10' + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" diff --git a/org_news/org_news/spiders/__init__.py b/org_news/org_news/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/org_news/org_news/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# 
# (tail of org_news/org_news/spiders/__init__.py — package marker, comments only)
# your spiders.

# --- org_news/org_news/spiders/org_news_distributed.py ---
import logging
import json
from datetime import datetime

import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str

from org_news.items import OrgNewsDatabaseItem


class OrgNewsDistributedSpider(RedisSpider):
    """Distributed news spider.

    Consumes JSON task payloads pushed to this spider's Redis list (one
    selector config per task). Only tasks whose ``spider_cls`` is
    ``org_news_lib_database`` are handled; other tasks are skipped.
    """

    name = "org_news_distributed"

    custom_settings = dict(
        # Use the scrapy-redis scheduler so requests are shared across nodes.
        SCHEDULER="scrapy_redis.scheduler.Scheduler",
        # Redis-backed request fingerprint de-duplication.
        DUPEFILTER_CLASS="scrapy_redis.dupefilter.RFPDupeFilter",
        # Persist the queue/dupefilter between runs.
        SCHEDULER_PERSIST=True,
        # Redis connection parameters come from settings.py (REDIS_URL).
    )

    def make_request_from_data(self, data):
        """Turn one raw Redis payload into the initial list-page request.

        Yields a single Request, or nothing when the task belongs to a
        different spider class.
        """
        cfg = json.loads(bytes_to_str(data, self.redis_encoding))
        if cfg['spider_cls'] != 'org_news_lib_database':
            self.logger.warning('不需要 %s 消费,跳过\n配置为:%s' % (self.name, cfg))
            # Bare return ends the generator. The original `return []` was
            # misleading: a generator's return value is discarded.
            return
        list_selector = cfg.pop('list_selector')
        detail_selector = cfg.pop('detail_selector')
        yield scrapy.Request(url=cfg['news_module_link'], dont_filter=True,
                             meta=dict(s_cfg=cfg, list_selector=list_selector,
                                       detail_selector=detail_selector),
                             callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse a news list page and follow every entry's detail link."""
        req_meta = response.meta
        s_cfg = req_meta['s_cfg']
        current_module = '-'.join([s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']])
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        if not nodes:
            self.logger.warning("""
            当前采集模块:%s
            没有采集到新闻链接
            资源页链接:%s""" % (current_module, req_meta['s_cfg']['news_module_link']))
        else:
            # FIX: the success-branch message wrongly began with 没有
            # ("did not collect") — it actually reports links that WERE found.
            self.logger.info("""
            当前采集模块:%s
            采集到新闻链接 %s 个
            资源页链接:%s""" % (current_module, len(nodes), req_meta['s_cfg']['news_module_link']))
        for node in nodes:
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                # `sel and ...` keeps None when a selector is not configured.
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article body and emit one OrgNewsDatabaseItem."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        # NOTE(review): 'spider_cls' is not declared as a Field on
        # OrgNewsDatabaseItem in items.py — scrapy.Item raises KeyError for
        # undeclared fields. Add `spider_cls = scrapy.Field()` to items.py.
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item

# --- org_news/org_news/spiders/org_news_fudan_lib.py (module continues in the next hunk) ---
# --- org_news/org_news/spiders/org_news_fudan_lib.py ---
import logging                 # FIX: logging.WARNING is used below but was never imported (NameError)
from datetime import datetime  # FIX: datetime.now() was called without this import (NameError)

import scrapy

from org_news.items import OrgNewsDatabaseItem


class OrgNewsFudanLibSpider(scrapy.Spider):
    """Fudan University library news spider (wp3 generalQuery JSON endpoint)."""

    name = "org_news_fudan_lib"
    allowed_domains = ["www.fudan.edu.cn"]
    start_urls = ["https://www.fudan.edu.cn/"]
    cfgs = [
        dict(org_name='复旦大学',
             org_domain='https://www.fudan.edu.cn/',
             second_org_name='图书馆',
             second_org_domain='https://library.fudan.edu.cn/',
             news_module='资源动态',
             news_module_link='https://library.fudan.edu.cn/zydtx/list.htm')
    ]

    def start_requests(self):
        # FIX: FormRequest formdata values must all be strings — the original
        # passed ints (siteId=928, rows=14, ...), which Scrapy rejects.
        # NOTE(review): no callback/meta is wired yet, so the response falls
        # through to the missing default parse(); the endpoint returns JSON,
        # while parse_news_list below expects XPath selectors supplied via
        # meta — this spider looks unfinished. TODO: add a JSON callback.
        yield scrapy.FormRequest(
            url="https://library.fudan.edu.cn/_wp3services/generalQuery?queryObj=articles",
            formdata=dict(
                siteId='928', rows='14', columnId='42893', pageIndex='1',
                returnInfos='[{"field":"title","name":"title"},{"field":"modifyTime","pattern":[{"name":"d","value":"yyyy-MM-dd"}],"name":"modifyTime"}]',
                conditions='[]', orders='[{"field":"modifyTime","type":"desc"}]'
            ))

    def parse_news_list(self, response):
        """Parse a news list page and follow every entry's detail link.

        NOTE(review): expects 'list_selector'/'s_cfg'/'detail_selector' in
        response.meta, which start_requests never sets — confirm before use.
        """
        req_meta = response.meta
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        for node in nodes:
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article body and emit one OrgNewsDatabaseItem."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        # NOTE(review): 'spider_cls' is not a declared Field on the item —
        # raises KeyError; declare it in items.py.
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item


# --- org_news/org_news/spiders/org_news_hust_lib.py ---
import scrapy


class OrgNewsHustLibSpider(scrapy.Spider):
    """Placeholder spider for HUST library news — not implemented yet."""

    name = "org_news_hust_lib"
    allowed_domains = ["www.hust.edu.cn"]
    start_urls = ["https://www.hust.edu.cn/"]

    def parse(self, response):
        pass


# --- org_news/org_news/spiders/org_news_lib_browser.py ---
import scrapy


class OrgNewsLibBrowserSpider(scrapy.Spider):
    """Placeholder spider for browser-rendered library pages — not implemented yet."""

    name = "org_news_lib_browser"
    allowed_domains = ["lib.edu.cn"]
    start_urls = ["https://lib.edu.cn"]

    def parse(self, response):
        pass


# --- org_news/org_news/spiders/org_news_lib_database.py (methods continue in the next hunk) ---
import logging
import json
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem
from org_news.utils.read_cfg import read_cfg, format_cfg


class OrgNewsLibDatabaseSpider(scrapy.Spider):
    """Generic config-driven library news spider.

    Selector configs are read from selector_cfg/library_selector.txt,
    one JSON object per line (see utils/read_cfg.py).
    """

    name = "org_news_lib_database"
    # allowed_domains = ["lib.edu.cn"]
    # start_urls = ["https://lib.edu.cn"]
read_cfg('D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'): + if cfg['spider_cls'] != self.name: continue + cfg = format_cfg(cfg) + list_selector = cfg.pop('list_selector') + detail_selector = cfg.pop('detail_selector') + yield scrapy.Request(url=cfg['news_module_link'], + meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector), + callback=self.parse_news_list) + + def parse_news_list(self, response): + """ + 解析新闻列表页 + """ + req_meta = response.meta + s_cfg = req_meta['cfg'] + current_module = '-'.join([s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]) + list_selector = req_meta.get('list_selector') + nodes = response.xpath(list_selector['list_s']) + news_label_s = list_selector.get('label_s', None) + if not nodes: + self.logger.warning(""" + 当前采集模块:%s + 没有采集到新闻链接 + 资源页链接:%s""" % (current_module, req_meta['cfg']['news_module_link'])) + else: + self.logger.info(""" + 当前采集模块:%s + 没有采集到新闻链接 %s 个 + 资源页链接:%s""" % (current_module, len(nodes), req_meta['cfg']['news_module_link'])) + for node in nodes: + list_data = dict( + title=node.xpath(list_selector['title_s']).get(), + pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(), + pub_link=list_selector['news_link_s'] and response.urljoin( + node.xpath(list_selector['news_link_s']).get()), + news_label=news_label_s and node.xpath(news_label_s).get() + ) + if not list_data['pub_link']: + self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING) + continue + yield response.follow(list_data['pub_link'], + meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'], + detail_selector=req_meta['detail_selector']), + callback=self.parse_news_detail) + + def parse_news_detail(self, response): + req_meta = response.meta + s_cfg = req_meta.get('s_cfg') + detail_selector = req_meta.get('detail_selector') + list_data = req_meta.get('list_data', {}) + contents = 
response.xpath(detail_selector['content_s']).getall() + text = '\n'.join([s.strip() for s in contents]) + news_item = OrgNewsDatabaseItem() + news_item['title'] = list_data.get('title') + news_item['pub_time'] = list_data.get('pub_time') + news_item['news_link'] = list_data.get('pub_link') + news_item['news_label'] = list_data.get('news_label', None) + news_item['news_content'] = text + news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']] + news_item['spider_cls'] = s_cfg['spider_cls'] + news_item['updated_at'] = datetime.now() + yield news_item diff --git a/org_news/org_news/spiders/org_news_lib_test.py b/org_news/org_news/spiders/org_news_lib_test.py new file mode 100644 index 0000000..3f6bf41 --- /dev/null +++ b/org_news/org_news/spiders/org_news_lib_test.py @@ -0,0 +1,19 @@ +import scrapy + +from org_news.utils.read_cfg import read_cfg +from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider + + +class OrgNewsLibTestSpider(OrgNewsLibDatabaseSpider): + name = "org_news_lib_test" + + def start_requests(self): + for cfg in read_cfg('D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'): + # if cfg['spider_cls'] != self.name: continue + list_selector = dict(list_s=cfg.pop('list_s'), label_s=cfg.pop('label_s', None), + title_s=cfg.pop('title_s'), + datetime_s=cfg.pop('datetime_s'), news_link_s=cfg.pop('news_link_s')) + detail_selector = dict(content_s=cfg.pop('content_s')) + yield scrapy.Request(url=cfg['news_module_link'], + meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector), + callback=self.parse_news_list) diff --git a/org_news/org_news/spiders/org_news_sjtu_lib.py b/org_news/org_news/spiders/org_news_sjtu_lib.py new file mode 100644 index 0000000..ffe5afc --- /dev/null +++ b/org_news/org_news/spiders/org_news_sjtu_lib.py @@ -0,0 +1,39 @@ + +import logging +import scrapy +from scrapy_redis.spiders import RedisSpider + +from 
org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider +from org_news.utils.read_cfg import read_cfg +from org_news.utils import tools + + +class OrgNewsSjtuLibSpider(OrgNewsLibDatabaseSpider): + name = "org_news_sjtu_lib" + # allowed_domains = ["www.lib.sjtu.edu.cn"] + # start_urls = ["https://www.lib.sjtu.edu.cn/"] + + def parse_news_list(self, response): + """ + 解析新闻列表页 + """ + req_meta = response.meta + list_selector = req_meta.get('list_selector') + nodes = response.xpath(list_selector['list_s']) + news_label_s = list_selector.get('label_s', None) + for node in nodes: + list_data = dict( + title=node.xpath(list_selector['title_s']).get(), + pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(), + pub_link=list_selector['news_link_s'] and response.urljoin( + node.xpath(list_selector['news_link_s']).get()), + news_label=news_label_s and node.xpath(news_label_s).get() + ) + if not list_data['pub_link']: + self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING) + continue + params = tools.url_parse(list_data['pub_link']) + yield response.follow('https://www.lib.sjtu.edu.cn/f/content/content.shtml?id=%s' % params.get('id'), + meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'], + detail_selector=req_meta['detail_selector']), + callback=self.parse_news_detail) diff --git a/org_news/org_news/spiders/org_news_whu_lib.py b/org_news/org_news/spiders/org_news_whu_lib.py new file mode 100644 index 0000000..d6e74e5 --- /dev/null +++ b/org_news/org_news/spiders/org_news_whu_lib.py @@ -0,0 +1,10 @@ +import scrapy + + +class OrgNewsWhuLibSpider(scrapy.Spider): + name = "org_news_whu_lib" + allowed_domains = ["www.whu.edu.cn"] + start_urls = ["https://www.whu.edu.cn/"] + + def parse(self, response): + pass diff --git a/org_news/org_news/spiders/org_news_xjtu_lib.py b/org_news/org_news/spiders/org_news_xjtu_lib.py new file mode 100644 index 0000000..39e8156 --- /dev/null +++ 
# --- org_news/org_news/spiders/org_news_xjtu_lib.py ---
import logging
import json
from datetime import datetime
from urllib.parse import urlparse

import scrapy
from parsel.selector import Selector

from org_news.items import OrgNewsDatabaseItem
from org_news.utils.read_cfg import read_cfg, format_cfg
from org_news.utils import tools

# The list API wraps each record's fields in numbered slots "0".."6".
FIELD_KEYS = [str(i) for i in range(7)]
engineInstanceId = '361785'


def find_value_by_key(key: str, obj: dict):
    """Return the 'value' of the numbered slot whose 'key' equals *key*, else None."""
    for idx in FIELD_KEYS:
        slot = obj.get(idx)  # .get(): records may carry fewer slots (obj[idx] raised KeyError)
        if isinstance(slot, dict) and slot.get('key') == key:
            return slot.get('value')


class OrgNewsXjtuLibSpider(scrapy.Spider):
    """XJTU library news spider — talks to the site's engine2 JSON APIs."""

    name = "org_news_xjtu_lib"
    allowed_domains = ["www.lib.xjtu.edu.cn"]
    request_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{module_id}/type/more-datas'
    detail_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{news_id}/detail'
    start_urls = ["https://www.xjtu.edu.cn/"]

    def start_requests(self):
        # Raw string (invalid escape sequences in the original literal);
        # NOTE(review): hard-coded absolute path — move to settings.
        cfg_path = r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'
        for cfg in read_cfg(cfg_path):
            if cfg['spider_cls'] != self.name:
                continue
            cfg = format_cfg(cfg)
            list_selector = cfg.pop('list_selector')
            detail_selector = cfg.pop('detail_selector')
            # The module id is the 2nd path segment of the configured link.
            module_id = urlparse(cfg['news_module_link']).path.split('/')[2]
            params = tools.url_parse(cfg['news_module_link'])
            yield scrapy.FormRequest(self.request_api.format(module_id=module_id), method="POST",
                                     dont_filter=True,
                                     # default '' keeps FormRequest from choking on a None value
                                     formdata=dict(engineInstanceId=engineInstanceId, pageNum='1',
                                                   pageSize='20', typeId=params.get('typeId', ''),
                                                   topTypeId='', sw=''),
                                     meta=dict(s_cfg=cfg, list_selector=list_selector,
                                               detail_selector=detail_selector,
                                               websiteId=params.get('websiteId')),
                                     callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse the JSON list response and request each article's detail API."""
        req_meta = response.meta
        websiteId = req_meta.get('websiteId')
        if response.status != 200:
            self.log('响应状态码异常')
            return
        resp_json = json.loads(response.text)
        if resp_json.get('status') != 200 or resp_json.get('message') != "请求正确响应":
            self.log('响应内容异常')
            return
        nodes = resp_json.get('data', {}).get('datas', {}).get('datas')
        for node in nodes or []:  # guard: inner 'datas' may be missing/None
            list_data = dict(
                title=find_value_by_key('标题', node),
                pub_time=find_value_by_key('时间', node),
                pub_link=node.get('url') and response.urljoin(node.get('url')),
                news_label=None
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            # FIX: the original f-string contained '¤tBranch' — '&curren'
            # had been mangled into the '¤' character (HTML entity &curren;);
            # the query parameter is 'currentBranch'.
            payload = ('engineInstanceId=%s&typeId=%s&pageId=1&websiteId=%s&currentBranch=0'
                       % (node.get("engineInstanceId", engineInstanceId), node.get("typeId"), websiteId))
            list_data['pub_link'] = pub_link = self.detail_api.format(news_id=node.get('id')) + '?' + payload
            yield scrapy.Request(url=pub_link,
                                 meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                           detail_selector=req_meta['detail_selector']),
                                 callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article text from the inline JS data blob and emit an item."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        list_data = req_meta.get('list_data', {})

        contents = None
        last_script = response.xpath('/html/script[last()]/text()')
        if last_script:
            data_text = last_script.re_first(r'data: (\{"engineInstanceId".*?\}),\r\n')
            if data_text:  # FIX: json.loads(None) raises TypeError, not JSONDecodeError
                try:
                    data_dic = json.loads(data_text)
                except json.JSONDecodeError:
                    contents = None
                else:
                    # The article HTML sits in the blob's 'content' key.
                    selector = Selector(data_dic.get('content', ''), type='html')
                    contents = selector.xpath('string(.)').get(None)
        if not contents:
            self.logger.warning("没有提取到数据")

        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = contents
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        # NOTE(review): 'spider_cls' is not a declared Field on the item —
        # raises KeyError; declare it in items.py.
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item


# --- org_news/org_news/spiders/seu_lib_resource_dynamics.py ---
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem


class SeuLibResourceDynamicsSpider(scrapy.Spider):
    """SEU library 'resource dynamics' spider (fixed list page, first page only)."""

    name = "seu_lib_resource_dynamics"
    allowed_domains = ["lib.seu.edu.cn"]
    start_urls = ["https://lib.seu.edu.cn/list.php?fid=264&page=1"]

    def parse(self, response):
        nodes = response.xpath('//div[@class="content-right-list"]/ul/li[@class="list-item"]')
        for node in nodes:
            list_data = dict(
                title=node.xpath('./a/span/text()').get(),
                pub_time=node.xpath('./span[@class="item-time"]/text()').get(),
                pub_link=response.urljoin(node.xpath('./a/@href').get()),
            )
            yield response.follow(list_data['pub_link'], callback=self.parse_news_detail,
                                  meta=dict(list_data=list_data))

    def parse_news_detail(self, response):
        list_data = response.meta.get('list_data', {})
        # string(...) evaluates to a single concatenated string, so .get() is
        # enough (the original getall() always produced a 1-element list).
        text = (response.xpath('string(//div[@class="article-wrap"])').get() or '').strip()
        # FIX: the original only print()ed the text and never yielded an item.
        # OrgNewsDatabaseItem is used because plain OrgNewsItem has no
        # news_content field.
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_content'] = text
        news_item['updated_at'] = datetime.now()
        yield news_item


# --- org_news/org_news/utils/__init__.py ---
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 14:42
# @Author : zhaoxiangpeng
# @File : __init__.py

# --- org_news/org_news/utils/read_cfg.py (module in the next hunk) ---
: zhaoxiangpeng +# @File : read_cfg.py + +import json + + +def read_cfg(file): + with open(file, 'r', encoding='utf-8') as f: + while line := f.readline(): + yield json.loads(line) + + +def format_cfg(cfg: dict): + list_selector = dict(list_s=cfg.pop('list_s'), label_s=cfg.pop('label_s', None), title_s=cfg.pop('title_s'), + datetime_s=cfg.pop('datetime_s'), news_link_s=cfg.pop('news_link_s')) + detail_selector = dict(content_s=cfg.pop('content_s')) + new_cfg = cfg.copy() + new_cfg['list_selector'] = list_selector + new_cfg['detail_selector'] = detail_selector + return new_cfg + + +def test_read_cfg(): + for cfg in read_cfg('/scrapy-demo1/org_news/org_news/selector_cfg/library_selector.txt'): + print(cfg) + print(format_cfg(cfg)) + + +if __name__ == '__main__': + test_read_cfg() diff --git a/org_news/org_news/utils/tools.py b/org_news/org_news/utils/tools.py new file mode 100644 index 0000000..52336c5 --- /dev/null +++ b/org_news/org_news/utils/tools.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 14:42 +# @Author : zhaoxiangpeng +# @File : tools.py + +from urllib.parse import urlparse, parse_qs + + +def url_parse(url: str): + """ + url解析为dict + :param url: + :return: + """ + query = urlparse(url).query + params = parse_qs(query) + result = {key: params[key][0] if params[key].__len__() == 1 else params[key] for key in params} + return result diff --git a/org_news/run.py b/org_news/run.py new file mode 100644 index 0000000..f70555a --- /dev/null +++ b/org_news/run.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/25 13:39 +# @Author : zhaoxiangpeng +# @File : run.py +import os +import re +from scrapy.crawler import CrawlerProcess +from scrapy.settings import Settings +from scrapy.utils.project import get_project_settings +regex = re.compile(r'^(org_news_).*?(_lib)$') + + +def load_spider_script(path): + """ + 加载特定的爬虫脚本 + """ + scripts = os.listdir(path) + spiders = [] + for script in scripts: + if not script.endswith('.py'): + continue 
+ spider_name = script[:-3] + if bool(regex.search(spider_name)): + spiders.append(spider_name) + return spiders + + +process = CrawlerProcess(get_project_settings()) + +# process.crawl('org_news_sjtu_lib') +process.crawl('org_news_xjtu_lib') +process.start() diff --git a/org_news/scrapy.cfg b/org_news/scrapy.cfg new file mode 100644 index 0000000..ab00335 --- /dev/null +++ b/org_news/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = org_news.settings + +[deploy] +#url = http://localhost:6800/ +project = org_news