add:org_news commit to kc
parent
2e0c44e983
commit
25ffff7604
@ -0,0 +1,25 @@
|
|||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsItem(scrapy.Item):
    """Base item for a scraped organisation-news entry (list-page fields)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # headline text scraped from the list page
    pub_time = scrapy.Field()   # publication date/time string as shown on the page
    news_link = scrapy.Field()  # absolute URL of the news detail page

    updated_at = scrapy.Field()  # timestamp set when the item is scraped
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsDatabaseItem(OrgNewsItem):
    """News item extended with detail-page fields, intended for DB storage.

    ``title``/``pub_time``/``news_link``/``updated_at`` are inherited from
    :class:`OrgNewsItem`; scrapy Item subclasses inherit fields automatically,
    so the redundant re-declaration of ``title`` has been removed.
    """
    news_label = scrapy.Field()    # label/category attached to the headline
    news_content = scrapy.Field()  # full article body text
    tags = scrapy.Field()          # keyword tags added by NewsTitleClassifyPipeline
    news_source = scrapy.Field()   # provenance list: [org, sub-org, news module]
    spider_cls = scrapy.Field()    # name of the spider that produced the item
|
@ -0,0 +1,11 @@
|
|||||||
|
# -*- coding: utf-8 -*-
# @Time : 2025/7/21 17:29
# @Author : zhaoxiangpeng
# @File : main.py
"""Entry point: launch a scrapy crawl via the command-line API."""

from scrapy.cmdline import execute


def main():
    """Run the distributed org-news spider (same as ``scrapy crawl ...``)."""
    execute('scrapy crawl org_news_distributed'.split())
    # execute('scrapy crawl org_news_lib_test'.split())
    # execute('scrapy crawl org_news_sjtu_lib'.split())


# Guarded so importing this module no longer starts a crawl as a side effect.
if __name__ == '__main__':
    main()
|
@ -0,0 +1,100 @@
|
|||||||
|
# Define here the models for your spider middleware
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
from scrapy import signals
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsSpiderMiddleware:
    """Spider-middleware scaffold (scrapy project template).

    Every hook below is optional; an undefined hook means scrapy passes the
    objects through unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this factory to build the middleware instance.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response entering the spider.
        # Returning None continues processing; raising aborts it.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs over the spider's results; must yield Request or item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or an inner middleware's process_spider_input)
        # raises. Returning None defers to other handlers.
        return None

    async def process_start(self, start):
        # Async pass-through over the spider's start() output (or the matching
        # method of an earlier spider middleware).
        async for element in start:
            yield element

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsDownloaderMiddleware:
    """Downloader-middleware scaffold (scrapy project template).

    All hooks are optional; a missing hook means the middleware does not
    modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this factory to build the middleware instance.
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_request(self, request, spider):
        # Runs for every request going through the downloader.
        # None -> continue normal handling; could also return a Response or
        # Request object, or raise IgnoreRequest to trigger process_exception().
        return None

    def process_response(self, request, response, spider):
        # Runs on every downloaded response; pass it through unchanged.
        # (Must return a Response, a Request, or raise IgnoreRequest.)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or process_request() raises.
        # None -> let other middlewares / handlers process the exception.
        return None

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
@ -0,0 +1,67 @@
|
|||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
|
||||||
|
import re
|
||||||
|
import pymongo
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsPipeline:
    """Default no-op pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform; hand the item to the next pipeline stage.
        return item
|
||||||
|
|
||||||
|
|
||||||
|
class NewsTitleClassifyPipeline:
    """Tag items by scanning the headline for known keyword groups.

    Only the ``Database`` keyword group is matched for now; all hits are
    stored in ``item['tags']`` as a list.
    """
    # keyword groups: database-resource terms and HR/staffing terms
    __KEYWORDS__ = dict(
        Database=['开通', '试用', '停订', '新增', '时长'],
        HumanAffairs=['现在馆长', '馆长更换']
    )
    # alternation over the Database keywords, compiled once at class level
    keyword_db_pattern = re.compile('|'.join(__KEYWORDS__['Database']))

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        news_title = adapter.get("title")
        # Guard: a missing/None title used to crash re.findall with a TypeError;
        # emit an empty tag list instead.
        item['tags'] = self.keyword_db_pattern.findall(news_title) if news_title else []
        return item
|
||||||
|
|
||||||
|
|
||||||
|
class NewsStandardPipeline:
    """Normalise article text by stripping CR/LF and all whitespace chars."""
    # matches any single whitespace character (incl. \r and \n); compiled once
    content_standard_pattern = re.compile(r'[\r\n\s]')

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        news_content = adapter.get("news_content")
        # Guard: items without content used to crash re.sub with a TypeError;
        # leave the item untouched in that case.
        if news_content is not None:
            item['news_content'] = self.content_standard_pattern.sub('', news_content)
        return item
|
||||||
|
|
||||||
|
|
||||||
|
class MongoPipeline:
    """Persist scraped items into MongoDB, one document per item."""
    # target collection inside the configured database
    collection_name = "data_org_news"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri  # full MongoDB connection URI
        self.mongo_db = mongo_db    # database name

    @classmethod
    def from_crawler(cls, crawler):
        # Connection parameters come from project settings
        # (MONGO_URI / MONGO_DATABASE, with "items" as the DB fallback).
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
        )

    def open_spider(self, spider):
        # One client per spider run; released again in close_spider().
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Insert the plain-dict form of the item, then pass the item on
        # unchanged so later pipeline stages still see it.
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
|
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 9:23
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : export_data.py
|
@ -0,0 +1,26 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 11:06
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : task_push.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
import redis
|
||||||
|
|
||||||
|
import org_news.settings as settings
|
||||||
|
from org_news.utils.read_cfg import read_cfg, format_cfg
|
||||||
|
|
||||||
|
|
||||||
|
def do_test():
    """Push the *test* selector-config file into the redis start-urls queue."""
    r = redis.StrictRedis.from_url(settings.REDIS_URL)
    # Raw string: the original non-raw Windows path contained invalid escape
    # sequences (e.g. "\s" in "\selector_cfg"), a SyntaxWarning today and a
    # SyntaxError in future Python versions.
    for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'):
        r.lpush('org_news_distributed:start_urls',
                json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':')))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Push the production selector-config file into the redis start-urls queue."""
    r = redis.StrictRedis.from_url(settings.REDIS_URL)
    # Raw string avoids invalid escape sequences in the Windows path
    # (the original non-raw literal contained "\s" etc.).
    for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'):
        r.lpush('org_news_distributed:start_urls',
                json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':')))


if __name__ == '__main__':
    main()
|
@ -0,0 +1,61 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/23 11:27
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : a_insert_xpath.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
|
||||||
|
class CfgTemplate:
    """Template holding one site's scraping configuration (XPath selectors).

    Instances are filled via attribute assignment and serialised through
    ``__dict__`` into the selector config files (one JSON object per line).
    """
    org_name = None           # organisation name (e.g. a university)
    org_domain = None         # organisation homepage URL
    second_org_name = None    # sub-organisation name (e.g. the library)
    second_org_domain = None  # sub-organisation homepage URL
    news_module = None        # name of the news section
    news_module_link = None   # URL of the news listing page
    list_s = None             # XPath: news list entries
    title_s = None            # XPath: headline (relative to a list entry)
    datetime_s = None         # XPath: publication date (relative to a list entry)
    news_link_s = None        # XPath: detail-page link (relative to a list entry)
    label_s = None            # XPath: label/category (relative, optional)
    content_s = None          # XPath: article body on the detail page
    spider_cls = None         # name of the spider that should handle this config
    invalid = None            # set to 1 to mark the config as disabled
|
||||||
|
|
||||||
|
|
||||||
|
def insert2testCfgFile(s_cfg: dict):
    """Overwrite the test selector file with a single JSON config line."""
    pprint(s_cfg)
    record = json.dumps(s_cfg, ensure_ascii=False) + '\n'
    # 'w+' truncates the file, so the test file only ever holds one config
    with open('library_selector_test.txt', 'w+', encoding='utf-8') as fh:
        fh.write(record)
|
||||||
|
|
||||||
|
|
||||||
|
def insert2cfgFile(s_cfg: dict):
    """Append one JSON config line to the production selector file."""
    pprint(s_cfg)
    record = json.dumps(s_cfg, ensure_ascii=False) + '\n'
    # 'a+' appends, so existing configs in the production file are preserved
    with open('library_selector.txt', 'a+', encoding='utf-8') as fh:
        fh.write(record)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build one site's selector config and write it to both config files."""
    module = CfgTemplate()
    # Plain attribute assignment replaces the unidiomatic
    # module.__setattr__('name', value) call chain — behaviour is identical.
    module.org_name = '西安交通大学'  # organisation name
    module.org_domain = 'https://www.xjtu.edu.cn/'  # organisation domain
    module.second_org_name = '图书馆'  # sub-organisation name
    module.second_org_domain = 'http://www.lib.xjtu.edu.cn/'  # sub-organisation domain
    module.news_module = '通知公告'  # news section name
    module.news_module_link = 'http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362'  # news section link
    module.list_s = None        # news list selector
    module.title_s = None       # news title selector
    module.news_link_s = None   # news link selector
    module.datetime_s = None    # publication datetime selector
    module.label_s = None       # news label selector
    module.content_s = None     # article content selector
    module.spider_cls = 'org_news_lib_database'  # spider name
    # module.invalid = 1        # uncomment to mark this config as disabled
    print(module)
    insert2testCfgFile(module.__dict__)
    insert2cfgFile(module.__dict__)


if __name__ == '__main__':
    main()
|
@ -0,0 +1,22 @@
|
|||||||
|
{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "新闻资讯", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=263", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=264", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "通知公告", "news_module_link": "https://lib.tsinghua.edu.cn/tzgg.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.tsinghua.edu.cn/zydt.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "活动日历", "news_module_link": "https://lib.tsinghua.edu.cn/hdrl.htm", "list_s": "//div[@id=\"kzyc2\"]//div[@class=\"rl-box\"]/ul//li", "title_s": "./div[contains(@class, \"rl-title\")]/a/text()", "news_link_s": "./div[contains(@class, \"rl-title\")]/a/@href", "datetime_s": "./div[@class=\"rl-label\"]/span[@class=\"rl-date\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"library-content-content\"]/div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "新闻", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/261xwlb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]/div[@class=\"sub_0261 ul-inline\"]/ul/li", "title_s": "./a/div/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "公告", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/262gglb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "资源", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/gm/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "活动", "news_module_link": "https://www.lib.pku.edu.cn/4whjy/45hdzljz/453qb/index.htm", "list_s": "//div[@class=\"wrap_sub0435\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./div/div/div[@class=\"bottom\"]/a/text()", "news_link_s": "./div/div/div[@class=\"bottom\"]/a/@href", "datetime_s": "./div/div/div[@class=\"bottom\"]/div[@class=\"info\"]/p/i[contains(@class, \"icon-shijianfuxing\")]/../text()", "label_s": "./div/div/a[contains(@class, \"type\")]/text()", "content_s": "string(//div[@class=\"content\"]/div/div[@class=\"wrap_sub_0262\"])", "spider_cls": "org_news_lib_browser", "invalid": 1}
|
||||||
|
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "通知公告", "news_module_link": "https://libweb.zju.edu.cn/39478/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://libweb.zju.edu.cn/55543/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "本馆新闻", "news_module_link": "https://libweb.zju.edu.cn/55989/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "新闻通告", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=3&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
|
||||||
|
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "资源动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=4&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
|
||||||
|
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "融媒体动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=26&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": null, "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
|
||||||
|
{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.nju.edu.cn/dzzy/zydt1.htm", "list_s": "//div[@class=\"zydt\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"time\"]/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "新闻通知", "news_module_link": "https://lib.nju.edu.cn/xw/xwtz.htm", "list_s": "//div[@class=\"gqzx-list\"]/ul/li", "title_s": "./a/text()", "news_link_s": "./a/@href", "datetime_s": "./span/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/资源动态/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p/text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "服务公告", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/服务公告/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "讲座培训", "news_module_link": "https://lib.ustc.edu.cn/category/讲座培训/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "资源信息", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493363", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1}
|
||||||
|
{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "通知公告", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1}
|
@ -0,0 +1,101 @@
|
|||||||
|
# Scrapy settings for org_news project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "org_news"

SPIDER_MODULES = ["org_news.spiders"]
NEWSPIDER_MODULE = "org_news.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "org_news (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "org_news.middlewares.OrgNewsSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

RETRY_ENABLED = True
RETRY_TIMES = 2  # retry twice on failure (i.e. up to 3 attempts in total)
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # extended with some common client-error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
    # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "org_news.pipelines.OrgNewsPipeline": 300,
    "org_news.pipelines.NewsTitleClassifyPipeline": 400,
    "org_news.pipelines.NewsStandardPipeline": 410,
    # "org_news.pipelines.MongoPipeline": 500,
}

# MongoDB connection used by pipelines.MongoPipeline
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"

# Redis queue that feeds start URLs to the distributed spider (see task_push.py)
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
|
@ -0,0 +1,4 @@
|
|||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
@ -0,0 +1,66 @@
|
|||||||
|
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsFudanLibSpider(scrapy.Spider):
|
||||||
|
name = "org_news_fudan_lib"
|
||||||
|
allowed_domains = ["www.fudan.edu.cn"]
|
||||||
|
start_urls = ["https://www.fudan.edu.cn/"]
|
||||||
|
cfgs = [
|
||||||
|
dict(org_name='复旦大学',
|
||||||
|
org_domain='https://www.fudan.edu.cn/',
|
||||||
|
second_org_name='图书馆',
|
||||||
|
second_org_domain='https://library.fudan.edu.cn/',
|
||||||
|
news_module='资源动态',
|
||||||
|
news_module_link='https://library.fudan.edu.cn/zydtx/list.htm')
|
||||||
|
]
|
||||||
|
|
||||||
|
def start_requests(self):
|
||||||
|
yield scrapy.FormRequest(url="https://library.fudan.edu.cn/_wp3services/generalQuery?queryObj=articles",
|
||||||
|
formdata=dict(
|
||||||
|
siteId=928, rows=14, columnId=42893, pageIndex=1,
|
||||||
|
returnInfos='[{"field":"title","name":"title"},{"field":"modifyTime","pattern":[{"name":"d","value":"yyyy-MM-dd"}],"name":"modifyTime"}]',
|
||||||
|
conditions='[]', orders='[{"field":"modifyTime","type":"desc"}]'
|
||||||
|
))
|
||||||
|
|
||||||
|
def parse_news_list(self, response):
|
||||||
|
"""
|
||||||
|
解析新闻列表页
|
||||||
|
"""
|
||||||
|
req_meta = response.meta
|
||||||
|
list_selector = req_meta.get('list_selector')
|
||||||
|
nodes = response.xpath(list_selector['list_s'])
|
||||||
|
news_label_s = list_selector.get('label_s', None)
|
||||||
|
for node in nodes:
|
||||||
|
list_data = dict(
|
||||||
|
title=node.xpath(list_selector['title_s']).get(),
|
||||||
|
pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
|
||||||
|
pub_link=list_selector['news_link_s'] and response.urljoin(
|
||||||
|
node.xpath(list_selector['news_link_s']).get()),
|
||||||
|
news_label=news_label_s and node.xpath(news_label_s).get()
|
||||||
|
)
|
||||||
|
if not list_data['pub_link']:
|
||||||
|
self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
|
||||||
|
continue
|
||||||
|
yield response.follow(list_data['pub_link'],
|
||||||
|
meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
|
||||||
|
detail_selector=req_meta['detail_selector']),
|
||||||
|
callback=self.parse_news_detail)
|
||||||
|
|
||||||
|
def parse_news_detail(self, response):
    """Parse an article page and yield a populated OrgNewsDatabaseItem.

    List-page fields arrive via ``response.meta['list_data']``; the article
    body is joined from the configured content selector.
    """
    meta = response.meta
    cfg = meta.get('s_cfg')
    selectors = meta.get('detail_selector')
    listing = meta.get('list_data', {})

    fragments = response.xpath(selectors['content_s']).getall()
    body_text = '\n'.join(fragment.strip() for fragment in fragments)

    item = OrgNewsDatabaseItem()
    item['title'] = listing.get('title')
    item['pub_time'] = listing.get('pub_time')
    item['news_link'] = listing.get('pub_link')
    item['news_label'] = listing.get('news_label', None)
    item['news_content'] = body_text
    # Source is the (org, sub-org, module) triple from the crawl config.
    item['news_source'] = [cfg['org_name'], cfg['second_org_name'], cfg['news_module']]
    item['spider_cls'] = cfg['spider_cls']
    item['updated_at'] = datetime.now()
    yield item
|
@ -0,0 +1,10 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsHustLibSpider(scrapy.Spider):
    """Placeholder spider for HUST (华中科技大学); parsing is not implemented yet."""

    name = "org_news_hust_lib"
    allowed_domains = ["www.hust.edu.cn"]
    start_urls = ["https://www.hust.edu.cn/"]

    def parse(self, response):
        """Default callback — scaffold only, no extraction logic yet."""
        pass
|
@ -0,0 +1,10 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsLibBrowserSpider(scrapy.Spider):
    """Placeholder library-news spider (generic domain); parsing not implemented yet."""

    name = "org_news_lib_browser"
    allowed_domains = ["lib.edu.cn"]
    start_urls = ["https://lib.edu.cn"]

    def parse(self, response):
        """Default callback — scaffold only, no extraction logic yet."""
        pass
|
@ -0,0 +1,19 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
from org_news.utils.read_cfg import read_cfg
|
||||||
|
from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsLibTestSpider(OrgNewsLibDatabaseSpider):
    """Test spider: drives the generic database spider from a local selector-config file."""

    name = "org_news_lib_test"

    def start_requests(self):
        """Read one JSON config per line and issue the list-page request for each.

        Bug fix: the config path is now a raw string — the original non-raw
        literal relied on invalid escape sequences ("\\G", "\\s", ...), which
        emit SyntaxWarning on Python 3.12+ and only work by accident.
        """
        # NOTE(review): absolute developer path — consider moving to settings.
        cfg_file = r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'
        for cfg in read_cfg(cfg_file):
            # if cfg['spider_cls'] != self.name: continue
            # Split the flat config into list-page vs detail-page selectors.
            list_selector = dict(
                list_s=cfg.pop('list_s'),
                label_s=cfg.pop('label_s', None),
                title_s=cfg.pop('title_s'),
                datetime_s=cfg.pop('datetime_s'),
                news_link_s=cfg.pop('news_link_s'),
            )
            detail_selector = dict(content_s=cfg.pop('content_s'))
            yield scrapy.Request(
                url=cfg['news_module_link'],
                meta=dict(s_cfg=cfg, list_selector=list_selector,
                          detail_selector=detail_selector),
                callback=self.parse_news_list,
            )
|
@ -0,0 +1,39 @@
|
|||||||
|
|
||||||
|
import logging
|
||||||
|
import scrapy
|
||||||
|
from scrapy_redis.spiders import RedisSpider
|
||||||
|
|
||||||
|
from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider
|
||||||
|
from org_news.utils.read_cfg import read_cfg
|
||||||
|
from org_news.utils import tools
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsSjtuLibSpider(OrgNewsLibDatabaseSpider):
    """SJTU library spider: list rows are rewritten to the content.shtml
    endpoint keyed by the article id found in the listed link's query string."""

    name = "org_news_sjtu_lib"
    # allowed_domains = ["www.lib.sjtu.edu.cn"]
    # start_urls = ["https://www.lib.sjtu.edu.cn/"]

    def parse_news_list(self, response):
        """解析新闻列表页 — extract each row and follow the detail endpoint."""
        meta = response.meta
        selectors = meta.get('list_selector')
        label_xpath = selectors.get('label_s', None)
        for entry in response.xpath(selectors['list_s']):
            listing = dict(
                title=entry.xpath(selectors['title_s']).get(),
                pub_time=selectors['datetime_s'] and entry.xpath(selectors['datetime_s']).get(),
                pub_link=selectors['news_link_s'] and response.urljoin(
                    entry.xpath(selectors['news_link_s']).get()),
                news_label=label_xpath and entry.xpath(label_xpath).get()
            )
            if not listing['pub_link']:
                self.log('没有找到link: %s' % listing['title'], level=logging.WARNING)
                continue
            # Pull the article id out of the listed link's query string and
            # request the content endpoint with it.
            query = tools.url_parse(listing['pub_link'])
            detail_url = 'https://www.lib.sjtu.edu.cn/f/content/content.shtml?id=%s' % query.get('id')
            yield response.follow(
                detail_url,
                meta=dict(list_data=listing, s_cfg=meta['s_cfg'],
                          detail_selector=meta['detail_selector']),
                callback=self.parse_news_detail,
            )
|
@ -0,0 +1,10 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsWhuLibSpider(scrapy.Spider):
    """Placeholder spider for WHU (武汉大学); parsing is not implemented yet."""

    name = "org_news_whu_lib"
    allowed_domains = ["www.whu.edu.cn"]
    start_urls = ["https://www.whu.edu.cn/"]

    def parse(self, response):
        """Default callback — scaffold only, no extraction logic yet."""
        pass
|
@ -0,0 +1,113 @@
|
|||||||
|
import logging
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from parsel.selector import Selector
|
||||||
|
from org_news.items import OrgNewsDatabaseItem
|
||||||
|
from org_news.utils.read_cfg import read_cfg, format_cfg
|
||||||
|
from org_news.utils import tools
|
||||||
|
|
||||||
|
# The XJTU "engine2" API nests a node's field objects under the positional
# string keys '0'..'6'.
FIELD_KEYS = [str(i) for i in range(7)]
# Site-wide engine instance id used by the XJTU news API.
engineInstanceId = '361785'


def find_value_by_key(key: str, obj: dict):
    """Return the 'value' of the field dict in *obj* whose 'key' matches *key*.

    Bug fix: the original ``obj[idx]`` raised KeyError on nodes that omit one
    of the positional keys; lookups are now defensive and None is returned
    explicitly when no field matches.

    :param key: field name to look for (e.g. '标题', '时间')
    :param obj: one API node, with field dicts under keys '0'..'6'
    :return: the matching field's value, or None
    """
    for idx in FIELD_KEYS:
        field = obj.get(idx)  # node may omit some positional keys
        if isinstance(field, dict) and field.get('key') == key:
            return field.get('value')
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsXjtuLibSpider(scrapy.Spider):
    """XJTU library news spider.

    The site serves news through the "engine2" JSON API: the list endpoint is
    POSTed for a page of nodes, and each detail page embeds its payload as
    JSON inside the page's last <script> tag.
    """

    name = "org_news_xjtu_lib"
    allowed_domains = ["www.lib.xjtu.edu.cn"]
    # POST target for one page of a module's news list.
    request_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{module_id}/type/more-datas'
    # GET target for a single article's detail payload.
    detail_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{news_id}/detail'
    start_urls = ["https://www.xjtu.edu.cn/"]

    def start_requests(self):
        """Issue the list-API request for every config entry owned by this spider.

        Bug fix: the config path is a raw string now — the original non-raw
        literal relied on invalid escape sequences ("\\G", "\\s", ...) that
        warn on Python 3.12+.
        """
        # NOTE(review): absolute developer path — consider moving to settings.
        cfg_file = r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'
        for cfg in read_cfg(cfg_file):
            if cfg['spider_cls'] != self.name:
                continue
            cfg = format_cfg(cfg)
            list_selector = cfg.pop('list_selector')
            detail_selector = cfg.pop('detail_selector')
            # The module id is the second path segment of the configured link.
            path = urlparse(cfg['news_module_link']).path
            module_id = path.split('/')[2]
            params = tools.url_parse(cfg['news_module_link'])
            yield scrapy.FormRequest(
                self.request_api.format(module_id=module_id), method="POST", dont_filter=True,
                formdata=dict(engineInstanceId=engineInstanceId, pageNum='1', pageSize='20',
                              typeId=params.get('typeId'), topTypeId='', sw=''),
                meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector,
                          websiteId=params.get('websiteId')),
                callback=self.parse_news_list,
            )

    def parse_news_list(self, response):
        """解析新闻列表页 — walk one JSON page of list nodes and follow details."""
        req_meta = response.meta
        websiteId = req_meta.get('websiteId')
        if response.status != 200:
            self.log('响应状态码异常')
            return
        resp_json = json.loads(response.text)
        if resp_json.get('status') != 200 or resp_json.get('message') != "请求正确响应":
            self.log('响应内容异常')
            return
        nodes = resp_json.get('data', {}).get('datas', {}).get('datas')

        for node in nodes:
            list_data = dict(
                title=find_value_by_key('标题', node),
                pub_time=find_value_by_key('时间', node),
                pub_link=node.get('url') and response.urljoin(node.get('url')),
                news_label=None
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            # Fix: the query string read "...websiteId={websiteId}¤tBranch=0" —
            # HTML-entity garbling of "&currentBranch" ("&curren" -> ¤);
            # restored to the intended parameter name.
            payload = f'engineInstanceId={node.get("engineInstanceId", engineInstanceId)}&typeId={node.get("typeId")}&pageId=1&websiteId={websiteId}&currentBranch=0'
            list_data['pub_link'] = pub_link = self.detail_api.format(news_id=node.get('id')) + '?' + payload
            # self.logger.debug('publink: %s' % pub_link)
            yield scrapy.Request(
                url=pub_link,
                meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                          detail_selector=req_meta['detail_selector']),
                callback=self.parse_news_detail,
            )

    def parse_news_detail(self, response):
        """Extract the article body from the JSON embedded in the last <script> tag."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        list_data = req_meta.get('list_data', {})

        contents = None
        last_script = response.xpath('/html/script[last()]/text()')
        if last_script:
            data_text = last_script.re_first(r'data: (\{"engineInstanceId".*?\}),\r\n')
            # Bug fix: re_first() returns None when the pattern is absent, and
            # json.loads(None) raises TypeError — which the original except
            # clause (JSONDecodeError only) did not catch.
            if data_text:
                try:
                    data_dic = json.loads(data_text)
                    selector = Selector(data_dic.get('content', ''), type='html')
                except json.decoder.JSONDecodeError:
                    contents = None
                else:
                    contents = selector.xpath('string(.)').get(None)
        if not contents:
            self.logger.warning("没有提取到数据")

        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = contents
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item
|
@ -0,0 +1,27 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
from org_news.items import OrgNewsItem
|
||||||
|
|
||||||
|
|
||||||
|
class SeuLibResourceDynamicsSpider(scrapy.Spider):
    """SEU library "resource dynamics" spider (detail handling is still a stub)."""

    name = "seu_lib_resource_dynamics"
    allowed_domains = ["lib.seu.edu.cn"]
    start_urls = ["https://lib.seu.edu.cn/list.php?fid=264&page=1"]

    def parse(self, response):
        """Walk the list items and follow each article link."""
        rows = response.xpath('//div[@class="content-right-list"]/ul/li[@class="list-item"]')
        for row in rows:
            listing = {
                'title': row.xpath('./a/span/text()').get(),
                'pub_time': row.xpath('./span[@class="item-time"]/text()').get(),
                'pub_link': response.urljoin(row.xpath('./a/@href').get()),
            }
            yield response.follow(listing['pub_link'], callback=self.parse_news_detail,
                                  meta=dict(list_data=listing))

    def parse_news_detail(self, response):
        """Stub: join the article text and print it (no item is emitted yet)."""
        listing = response.meta.get('list_data', {})  # fetched but unused so far
        fragments = response.xpath('string(//div[@class="article-wrap"])').getall()
        article_text = '\n'.join(fragment.strip() for fragment in fragments)
        print(article_text)
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 14:42
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : __init__.py.py
|
@ -0,0 +1,32 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/23 15:02
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : read_cfg.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def read_cfg(file):
    """Yield one config dict per JSON line in *file* (JSON-Lines format).

    Bug fix: blank lines (including a trailing newline) made the original
    ``json.loads(line)`` raise JSONDecodeError and abort the whole generator;
    they are now skipped.

    :param file: path to a UTF-8 JSON-Lines config file
    :return: generator of dicts, one per non-blank line
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
|
||||||
|
|
||||||
|
|
||||||
|
def format_cfg(cfg: dict):
    """Split a flat selector config into grouped list/detail selector dicts.

    NOTE: the selector keys are pop()ed, so *cfg* is mutated in place; the
    returned dict is a copy of the remaining keys plus the two groups.

    :param cfg: flat config dict containing the ``*_s`` selector keys
    :return: new dict with 'list_selector' and 'detail_selector' sub-dicts
    """
    list_selector = {
        'list_s': cfg.pop('list_s'),
        'label_s': cfg.pop('label_s', None),
        'title_s': cfg.pop('title_s'),
        'datetime_s': cfg.pop('datetime_s'),
        'news_link_s': cfg.pop('news_link_s'),
    }
    detail_selector = {'content_s': cfg.pop('content_s')}
    formatted = cfg.copy()
    formatted['list_selector'] = list_selector
    formatted['detail_selector'] = detail_selector
    return formatted
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_cfg():
    """Smoke test: print every config line raw and after format_cfg()."""
    cfg_path = '/scrapy-demo1/org_news/org_news/selector_cfg/library_selector.txt'
    for cfg in read_cfg(cfg_path):
        print(cfg)
        print(format_cfg(cfg))


if __name__ == '__main__':
    test_read_cfg()
|
@ -0,0 +1,18 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 14:42
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : tools.py
|
||||||
|
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
|
|
||||||
|
|
||||||
|
def url_parse(url: str):
    """url解析为dict — parse the query string of *url* into a dict.

    Single-valued parameters are flattened to their scalar string; repeated
    parameters keep their list of values (parse_qs semantics).

    :param url: URL whose query part is parsed
    :return: dict mapping parameter name -> str or list[str]
    """
    params = parse_qs(urlparse(url).query)
    # len(values) instead of the original dunder call values.__len__()
    return {key: values[0] if len(values) == 1 else values
            for key, values in params.items()}
|
@ -0,0 +1,32 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/25 13:39
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : run.py
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
from scrapy.settings import Settings
|
||||||
|
from scrapy.utils.project import get_project_settings
|
||||||
|
# Matches spider module names shaped like "org_news_<school>_lib".
regex = re.compile(r'^(org_news_).*?(_lib)$')


def load_spider_script(path):
    """加载特定的爬虫脚本 — list spider module names under *path*.

    Only ``*.py`` files whose stem matches the ``org_news_*_lib`` naming
    scheme are kept; the ``.py`` suffix is stripped.

    :param path: directory to scan
    :return: list of matching module names (listdir order)
    """
    return [
        fname[:-3]
        for fname in os.listdir(path)
        if fname.endswith('.py') and regex.search(fname[:-3])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# Run the selected crawler(s) with the project settings.
# Fix: guarded under __main__ so importing this module no longer starts a
# crawl as an import-time side effect.
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    # process.crawl('org_news_sjtu_lib')
    process.crawl('org_news_xjtu_lib')
    process.start()
|
@ -0,0 +1,11 @@
|
|||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = org_news.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = org_news
|
Loading…
Reference in New Issue