From 25ffff7604afdc6a0d0427af9b3eddf92a3bca03 Mon Sep 17 00:00:00 2001 From: zhaoxiangpeng <1943364377@qq.com> Date: Tue, 5 Aug 2025 15:39:23 +0800 Subject: [PATCH] add:org_news commit to kc --- org_news/org_news/__init__.py | 0 org_news/org_news/items.py | 25 ++++ org_news/org_news/main.py | 11 ++ org_news/org_news/middlewares.py | 100 ++++++++++++++++ org_news/org_news/pipelines.py | 67 +++++++++++ org_news/org_news/scripts/create_table.py | 33 +++++ org_news/org_news/scripts/export_data.py | 4 + org_news/org_news/scripts/task_push.py | 26 ++++ .../org_news/selector_cfg/a_insert_xpath.py | 61 ++++++++++ .../selector_cfg/library_selector.txt | 22 ++++ org_news/org_news/settings.py | 101 ++++++++++++++++ org_news/org_news/spiders/__init__.py | 4 + .../org_news/spiders/org_news_distributed.py | 92 ++++++++++++++ .../org_news/spiders/org_news_fudan_lib.py | 66 ++++++++++ .../org_news/spiders/org_news_hust_lib.py | 10 ++ .../org_news/spiders/org_news_lib_browser.py | 10 ++ .../org_news/spiders/org_news_lib_database.py | 76 ++++++++++++ .../org_news/spiders/org_news_lib_test.py | 19 +++ .../org_news/spiders/org_news_sjtu_lib.py | 39 ++++++ org_news/org_news/spiders/org_news_whu_lib.py | 10 ++ .../org_news/spiders/org_news_xjtu_lib.py | 113 ++++++++++++++++++ .../spiders/seu_lib_resource_dynamics.py | 27 +++++ org_news/org_news/utils/__init__.py | 4 + org_news/org_news/utils/read_cfg.py | 32 +++++ org_news/org_news/utils/tools.py | 18 +++ org_news/run.py | 32 +++++ org_news/scrapy.cfg | 11 ++ 27 files changed, 1013 insertions(+) create mode 100644 org_news/org_news/__init__.py create mode 100644 org_news/org_news/items.py create mode 100644 org_news/org_news/main.py create mode 100644 org_news/org_news/middlewares.py create mode 100644 org_news/org_news/pipelines.py create mode 100644 org_news/org_news/scripts/create_table.py create mode 100644 org_news/org_news/scripts/export_data.py create mode 100644 org_news/org_news/scripts/task_push.py create mode 100644 
org_news/org_news/selector_cfg/a_insert_xpath.py create mode 100644 org_news/org_news/selector_cfg/library_selector.txt create mode 100644 org_news/org_news/settings.py create mode 100644 org_news/org_news/spiders/__init__.py create mode 100644 org_news/org_news/spiders/org_news_distributed.py create mode 100644 org_news/org_news/spiders/org_news_fudan_lib.py create mode 100644 org_news/org_news/spiders/org_news_hust_lib.py create mode 100644 org_news/org_news/spiders/org_news_lib_browser.py create mode 100644 org_news/org_news/spiders/org_news_lib_database.py create mode 100644 org_news/org_news/spiders/org_news_lib_test.py create mode 100644 org_news/org_news/spiders/org_news_sjtu_lib.py create mode 100644 org_news/org_news/spiders/org_news_whu_lib.py create mode 100644 org_news/org_news/spiders/org_news_xjtu_lib.py create mode 100644 org_news/org_news/spiders/seu_lib_resource_dynamics.py create mode 100644 org_news/org_news/utils/__init__.py create mode 100644 org_news/org_news/utils/read_cfg.py create mode 100644 org_news/org_news/utils/tools.py create mode 100644 org_news/run.py create mode 100644 org_news/scrapy.cfg diff --git a/org_news/org_news/__init__.py b/org_news/org_news/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/org_news/org_news/items.py b/org_news/org_news/items.py new file mode 100644 index 0000000..4aeed24 --- /dev/null +++ b/org_news/org_news/items.py @@ -0,0 +1,25 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class OrgNewsItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + title = scrapy.Field() + pub_time = scrapy.Field() + news_link = scrapy.Field() + + updated_at = scrapy.Field() + + +class OrgNewsDatabaseItem(OrgNewsItem): + title = scrapy.Field() + news_label = scrapy.Field() + news_content = scrapy.Field() + tags = scrapy.Field() + news_source = scrapy.Field() + 
spider_cls = scrapy.Field() diff --git a/org_news/org_news/main.py b/org_news/org_news/main.py new file mode 100644 index 0000000..52beec7 --- /dev/null +++ b/org_news/org_news/main.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/21 17:29 +# @Author : zhaoxiangpeng +# @File : main.py + +from scrapy.cmdline import execute + + +execute('scrapy crawl org_news_distributed'.split()) +# execute('scrapy crawl org_news_lib_test'.split()) +# execute('scrapy crawl org_news_sjtu_lib'.split()) diff --git a/org_news/org_news/middlewares.py b/org_news/org_news/middlewares.py new file mode 100644 index 0000000..90c6786 --- /dev/null +++ b/org_news/org_news/middlewares.py @@ -0,0 +1,100 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import ItemAdapter + + +class OrgNewsSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. 
+ for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + async def process_start(self, start): + # Called with an async iterator over the spider start() method or the + # maching method of an earlier spider middleware. + async for item_or_request in start: + yield item_or_request + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class OrgNewsDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) diff --git a/org_news/org_news/pipelines.py b/org_news/org_news/pipelines.py new file mode 100644 index 0000000..55b7313 --- /dev/null +++ b/org_news/org_news/pipelines.py @@ -0,0 +1,67 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface + +import re +import pymongo +from itemadapter import ItemAdapter + + +class OrgNewsPipeline: + def process_item(self, item, spider): + return item + + +class NewsTitleClassifyPipeline: + __KEYWORDS__ = dict( + Database=['开通', '试用', '停订', '新增', '时长'], + HumanAffairs=['现在馆长', '馆长更换'] + ) + keyword_db_pattern = re.compile('|'.join(__KEYWORDS__['Database'])) + + def process_item(self, item, spider): + adapter = ItemAdapter(item) + news_title = adapter.get("title") + tags1 = self.keyword_db_pattern.findall(news_title) + item['tags'] = tags1 + return item + + +class NewsStandardPipeline: + content_standard_pattern = re.compile(r'[\r\n\s]') + + def process_item(self, item, spider): + adapter = ItemAdapter(item) + news_content = adapter.get("news_content") + item['news_content'] = self.content_standard_pattern.sub('', news_content) + return item + + +class MongoPipeline: + collection_name = "data_org_news" + + def __init__(self, mongo_uri, mongo_db): + self.mongo_uri = mongo_uri + self.mongo_db = mongo_db + + @classmethod + def from_crawler(cls, crawler): + return cls( + mongo_uri=crawler.settings.get("MONGO_URI"), + mongo_db=crawler.settings.get("MONGO_DATABASE", "items"), + ) + + def open_spider(self, spider): + self.client = 
pymongo.MongoClient(self.mongo_uri) + self.db = self.client[self.mongo_db] + + def close_spider(self, spider): + self.client.close() + + def process_item(self, item, spider): + self.db[self.collection_name].insert_one(ItemAdapter(item).asdict()) + return item diff --git a/org_news/org_news/scripts/create_table.py b/org_news/org_news/scripts/create_table.py new file mode 100644 index 0000000..e6a0772 --- /dev/null +++ b/org_news/org_news/scripts/create_table.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 9:23 +# @Author : zhaoxiangpeng +# @File : create_table.py + +import pymongo +from pymongo import MongoClient +from pymongo.database import Database + + +def mongo(): + client = MongoClient("mongodb://root:123456@192.168.1.211:27017/") + db = client['science2'] + return db + + +def create_news_collection(db: Database): + collection = db.get_collection('data_org_news') + # collection.create_index( + # keys=[] + # ) + collection.create_index( + keys=[ + ('news_source', pymongo.ASCENDING), + ], + background=True, + name='_news_s' + ) # 创建来源索引,这个字段是list + + +if __name__ == '__main__': + mongo_db = mongo() + create_news_collection(mongo_db) diff --git a/org_news/org_news/scripts/export_data.py b/org_news/org_news/scripts/export_data.py new file mode 100644 index 0000000..0ffa96f --- /dev/null +++ b/org_news/org_news/scripts/export_data.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 9:23 +# @Author : zhaoxiangpeng +# @File : export_data.py diff --git a/org_news/org_news/scripts/task_push.py b/org_news/org_news/scripts/task_push.py new file mode 100644 index 0000000..ddd477b --- /dev/null +++ b/org_news/org_news/scripts/task_push.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 11:06 +# @Author : zhaoxiangpeng +# @File : task_push.py + +import json +import redis + +import org_news.settings as settings +from org_news.utils.read_cfg import read_cfg, format_cfg + + +def do_test(): + r = 
redis.StrictRedis.from_url(settings.REDIS_URL) + for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'): + r.lpush('org_news_distributed:start_urls', json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':'))) + + +def main(): + r = redis.StrictRedis.from_url(settings.REDIS_URL) + for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'): + r.lpush('org_news_distributed:start_urls', json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':'))) + + +if __name__ == '__main__': + main() diff --git a/org_news/org_news/selector_cfg/a_insert_xpath.py b/org_news/org_news/selector_cfg/a_insert_xpath.py new file mode 100644 index 0000000..3ae3ddc --- /dev/null +++ b/org_news/org_news/selector_cfg/a_insert_xpath.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/23 11:27 +# @Author : zhaoxiangpeng +# @File : a_insert_xpath.py + +import json +from pprint import pprint + + +class CfgTemplate: + org_name = None + org_domain = None + second_org_name = None + second_org_domain = None + news_module = None + news_module_link = None + list_s = None + title_s = None + datetime_s = None + news_link_s = None + label_s = None + content_s = None + spider_cls = None + invalid = None + + +def insert2testCfgFile(s_cfg: dict): + pprint(s_cfg) + with open('library_selector_test.txt', 'w+', encoding='utf-8') as f: + f.write(json.dumps(s_cfg, ensure_ascii=False) + '\n') + + +def insert2cfgFile(s_cfg: dict): + pprint(s_cfg) + with open('library_selector.txt', 'a+', encoding='utf-8') as f: + f.write(json.dumps(s_cfg, ensure_ascii=False) + '\n') + + +def main(): + module = CfgTemplate() + module.__setattr__('org_name', '西安交通大学') # 机构名称 + module.__setattr__('org_domain', 'https://www.xjtu.edu.cn/') # 机构域名 + module.__setattr__('second_org_name', '图书馆') # 二级机构名称 + module.__setattr__('second_org_domain', 'http://www.lib.xjtu.edu.cn/') # 二级机构域名 + 
module.__setattr__('news_module', '通知公告') # 新闻所在模块名称 + module.__setattr__('news_module_link', 'http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362') # 新闻所在模块链接 + module.__setattr__('list_s', None) # 新闻列表选择器 + module.__setattr__('title_s', None) # 新闻标题选择器 + module.__setattr__('news_link_s', None) # 新闻链接选择器 + module.__setattr__('datetime_s', None) # 发布日期时间选择器 + module.__setattr__('label_s', None) # 新闻标签选择器 + module.__setattr__('content_s', None) # 正文内容选择器 + module.__setattr__('spider_cls', 'org_news_lib_database') # 爬虫类 + # module.__setattr__('invalid', 1) # 爬虫类 + print(module) + insert2testCfgFile(module.__dict__) + insert2cfgFile(module.__dict__) + + +if __name__ == '__main__': + main() diff --git a/org_news/org_news/selector_cfg/library_selector.txt b/org_news/org_news/selector_cfg/library_selector.txt new file mode 100644 index 0000000..8a45897 --- /dev/null +++ b/org_news/org_news/selector_cfg/library_selector.txt @@ -0,0 +1,22 @@ +{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "新闻资讯", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=263", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=264", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": 
"string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "通知公告", "news_module_link": "https://lib.tsinghua.edu.cn/tzgg.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.tsinghua.edu.cn/zydt.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "活动日历", "news_module_link": "https://lib.tsinghua.edu.cn/hdrl.htm", "list_s": "//div[@id=\"kzyc2\"]//div[@class=\"rl-box\"]/ul//li", "title_s": "./div[contains(@class, \"rl-title\")]/a/text()", "news_link_s": "./div[contains(@class, \"rl-title\")]/a/@href", "datetime_s": 
"./div[@class=\"rl-label\"]/span[@class=\"rl-date\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"library-content-content\"]/div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "新闻", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/261xwlb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]/div[@class=\"sub_0261 ul-inline\"]/ul/li", "title_s": "./a/div/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "公告", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/262gglb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "资源", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/gm/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": 
"string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"} +{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "活动", "news_module_link": "https://www.lib.pku.edu.cn/4whjy/45hdzljz/453qb/index.htm", "list_s": "//div[@class=\"wrap_sub0435\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./div/div/div[@class=\"bottom\"]/a/text()", "news_link_s": "./div/div/div[@class=\"bottom\"]/a/@href", "datetime_s": "./div/div/div[@class=\"bottom\"]/div[@class=\"info\"]/p/i[contains(@class, \"icon-shijianfuxing\")]/../text()", "label_s": "./div/div/a[contains(@class, \"type\")]/text()", "content_s": "string(//div[@class=\"content\"]/div/div[@class=\"wrap_sub_0262\"])", "spider_cls": "org_news_lib_browser", "invalid": 1} +{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "通知公告", "news_module_link": "https://libweb.zju.edu.cn/39478/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://libweb.zju.edu.cn/55543/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", 
"second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "本馆新闻", "news_module_link": "https://libweb.zju.edu.cn/55989/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "新闻通告", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=3&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"} +{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "资源动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=4&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"} +{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": 
"融媒体动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=26&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": null, "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"} +{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.nju.edu.cn/dzzy/zydt1.htm", "list_s": "//div[@class=\"zydt\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"time\"]/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "新闻通知", "news_module_link": "https://lib.nju.edu.cn/xw/xwtz.htm", "list_s": "//div[@class=\"gqzx-list\"]/ul/li", "title_s": "./a/text()", "news_link_s": "./a/@href", "datetime_s": "./span/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/资源动态/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p/text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": 
"图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "服务公告", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/服务公告/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "讲座培训", "news_module_link": "https://lib.ustc.edu.cn/category/讲座培训/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"} +{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "资源信息", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493363", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1} +{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "通知公告", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1} diff --git a/org_news/org_news/settings.py b/org_news/org_news/settings.py new file mode 100644 index 
0000000..5c2885c --- /dev/null +++ b/org_news/org_news/settings.py @@ -0,0 +1,101 @@ +# Scrapy settings for org_news project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "org_news" + +SPIDER_MODULES = ["org_news.spiders"] +NEWSPIDER_MODULE = "org_news.spiders" + +ADDONS = {} + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "org_news (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Concurrency and throttling settings +#CONCURRENT_REQUESTS = 16 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 +DOWNLOAD_DELAY = 1 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +DEFAULT_REQUEST_HEADERS = { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36" +} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "org_news.middlewares.OrgNewsSpiderMiddleware": 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html + +RETRY_ENABLED = True +RETRY_TIMES = 2 # 重试3次 +RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404] # 增加了一些常见的错误码 +DOWNLOADER_MIDDLEWARES = { + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550 + # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543, +} + +# Enable or disable 
extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "org_news.pipelines.OrgNewsPipeline": 300, + "org_news.pipelines.NewsTitleClassifyPipeline": 400, + "org_news.pipelines.NewsStandardPipeline": 410, + # "org_news.pipelines.MongoPipeline": 500, +} + +MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/" +MONGO_DATABASE = "science2" + +REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10' + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +FEED_EXPORT_ENCODING = "utf-8" diff --git a/org_news/org_news/spiders/__init__.py b/org_news/org_news/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/org_news/org_news/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# 
# (tail of org_news/org_news/spiders/__init__.py — package marker, comments only)
# your spiders.

# --- org_news/org_news/spiders/org_news_distributed.py ---
import logging
import json
from datetime import datetime

import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy_redis.utils import bytes_to_str

from org_news.items import OrgNewsDatabaseItem


class OrgNewsDistributedSpider(RedisSpider):
    """Distributed news spider.

    Consumes JSON task payloads pushed to this spider's Redis list (one
    selector config per task). Only tasks whose ``spider_cls`` is
    ``org_news_lib_database`` are handled; other tasks are skipped.
    """

    name = "org_news_distributed"

    custom_settings = dict(
        # Use the scrapy-redis scheduler so requests are shared across nodes.
        SCHEDULER="scrapy_redis.scheduler.Scheduler",
        # Redis-backed request fingerprint de-duplication.
        DUPEFILTER_CLASS="scrapy_redis.dupefilter.RFPDupeFilter",
        # Persist the queue/dupefilter between runs.
        SCHEDULER_PERSIST=True,
        # Redis connection parameters come from settings.py (REDIS_URL).
    )

    def make_request_from_data(self, data):
        """Turn one raw Redis payload into the initial list-page request.

        Yields a single Request, or nothing when the task belongs to a
        different spider class.
        """
        cfg = json.loads(bytes_to_str(data, self.redis_encoding))
        if cfg['spider_cls'] != 'org_news_lib_database':
            self.logger.warning('不需要 %s 消费,跳过\n配置为:%s' % (self.name, cfg))
            # Bare return ends the generator. The original `return []` was
            # misleading: a generator's return value is discarded.
            return
        list_selector = cfg.pop('list_selector')
        detail_selector = cfg.pop('detail_selector')
        yield scrapy.Request(url=cfg['news_module_link'], dont_filter=True,
                             meta=dict(s_cfg=cfg, list_selector=list_selector,
                                       detail_selector=detail_selector),
                             callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse a news list page and follow every entry's detail link."""
        req_meta = response.meta
        s_cfg = req_meta['s_cfg']
        current_module = '-'.join([s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']])
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        if not nodes:
            self.logger.warning("""
            当前采集模块:%s
            没有采集到新闻链接
            资源页链接:%s""" % (current_module, req_meta['s_cfg']['news_module_link']))
        else:
            # FIX: the success-branch message wrongly began with 没有
            # ("did not collect") — it actually reports links that WERE found.
            self.logger.info("""
            当前采集模块:%s
            采集到新闻链接 %s 个
            资源页链接:%s""" % (current_module, len(nodes), req_meta['s_cfg']['news_module_link']))
        for node in nodes:
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                # `sel and ...` keeps None when a selector is not configured.
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article body and emit one OrgNewsDatabaseItem."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        # NOTE(review): 'spider_cls' is not declared as a Field on
        # OrgNewsDatabaseItem in items.py — scrapy.Item raises KeyError for
        # undeclared fields. Add `spider_cls = scrapy.Field()` to items.py.
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item

# --- org_news/org_news/spiders/org_news_fudan_lib.py (module continues in the next hunk) ---
# --- org_news/org_news/spiders/org_news_fudan_lib.py ---
import logging                 # FIX: logging.WARNING is used below but was never imported (NameError)
from datetime import datetime  # FIX: datetime.now() was called without this import (NameError)

import scrapy

from org_news.items import OrgNewsDatabaseItem


class OrgNewsFudanLibSpider(scrapy.Spider):
    """Fudan University library news spider (wp3 generalQuery JSON endpoint)."""

    name = "org_news_fudan_lib"
    allowed_domains = ["www.fudan.edu.cn"]
    start_urls = ["https://www.fudan.edu.cn/"]
    cfgs = [
        dict(org_name='复旦大学',
             org_domain='https://www.fudan.edu.cn/',
             second_org_name='图书馆',
             second_org_domain='https://library.fudan.edu.cn/',
             news_module='资源动态',
             news_module_link='https://library.fudan.edu.cn/zydtx/list.htm')
    ]

    def start_requests(self):
        # FIX: FormRequest formdata values must all be strings — the original
        # passed ints (siteId=928, rows=14, ...), which Scrapy rejects.
        # NOTE(review): no callback/meta is wired yet, so the response falls
        # through to the missing default parse(); the endpoint returns JSON,
        # while parse_news_list below expects XPath selectors supplied via
        # meta — this spider looks unfinished. TODO: add a JSON callback.
        yield scrapy.FormRequest(
            url="https://library.fudan.edu.cn/_wp3services/generalQuery?queryObj=articles",
            formdata=dict(
                siteId='928', rows='14', columnId='42893', pageIndex='1',
                returnInfos='[{"field":"title","name":"title"},{"field":"modifyTime","pattern":[{"name":"d","value":"yyyy-MM-dd"}],"name":"modifyTime"}]',
                conditions='[]', orders='[{"field":"modifyTime","type":"desc"}]'
            ))

    def parse_news_list(self, response):
        """Parse a news list page and follow every entry's detail link.

        NOTE(review): expects 'list_selector'/'s_cfg'/'detail_selector' in
        response.meta, which start_requests never sets — confirm before use.
        """
        req_meta = response.meta
        list_selector = req_meta.get('list_selector')
        nodes = response.xpath(list_selector['list_s'])
        news_label_s = list_selector.get('label_s', None)
        for node in nodes:
            list_data = dict(
                title=node.xpath(list_selector['title_s']).get(),
                pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
                pub_link=list_selector['news_link_s'] and response.urljoin(
                    node.xpath(list_selector['news_link_s']).get()),
                news_label=news_label_s and node.xpath(news_label_s).get()
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            yield response.follow(list_data['pub_link'],
                                  meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                            detail_selector=req_meta['detail_selector']),
                                  callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article body and emit one OrgNewsDatabaseItem."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        detail_selector = req_meta.get('detail_selector')
        list_data = req_meta.get('list_data', {})
        contents = response.xpath(detail_selector['content_s']).getall()
        text = '\n'.join([s.strip() for s in contents])
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = text
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        # NOTE(review): 'spider_cls' is not a declared Field on the item —
        # raises KeyError; declare it in items.py.
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item


# --- org_news/org_news/spiders/org_news_hust_lib.py ---
import scrapy


class OrgNewsHustLibSpider(scrapy.Spider):
    """Placeholder spider for HUST library news — not implemented yet."""

    name = "org_news_hust_lib"
    allowed_domains = ["www.hust.edu.cn"]
    start_urls = ["https://www.hust.edu.cn/"]

    def parse(self, response):
        pass


# --- org_news/org_news/spiders/org_news_lib_browser.py ---
import scrapy


class OrgNewsLibBrowserSpider(scrapy.Spider):
    """Placeholder spider for browser-rendered library pages — not implemented yet."""

    name = "org_news_lib_browser"
    allowed_domains = ["lib.edu.cn"]
    start_urls = ["https://lib.edu.cn"]

    def parse(self, response):
        pass


# --- org_news/org_news/spiders/org_news_lib_database.py (methods continue in the next hunk) ---
import logging
import json
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem
from org_news.utils.read_cfg import read_cfg, format_cfg


class OrgNewsLibDatabaseSpider(scrapy.Spider):
    """Generic config-driven library news spider.

    Selector configs are read from selector_cfg/library_selector.txt,
    one JSON object per line (see utils/read_cfg.py).
    """

    name = "org_news_lib_database"
    # allowed_domains = ["lib.edu.cn"]
    # start_urls = ["https://lib.edu.cn"]
read_cfg('D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'): + if cfg['spider_cls'] != self.name: continue + cfg = format_cfg(cfg) + list_selector = cfg.pop('list_selector') + detail_selector = cfg.pop('detail_selector') + yield scrapy.Request(url=cfg['news_module_link'], + meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector), + callback=self.parse_news_list) + + def parse_news_list(self, response): + """ + 解析新闻列表页 + """ + req_meta = response.meta + s_cfg = req_meta['cfg'] + current_module = '-'.join([s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]) + list_selector = req_meta.get('list_selector') + nodes = response.xpath(list_selector['list_s']) + news_label_s = list_selector.get('label_s', None) + if not nodes: + self.logger.warning(""" + 当前采集模块:%s + 没有采集到新闻链接 + 资源页链接:%s""" % (current_module, req_meta['cfg']['news_module_link'])) + else: + self.logger.info(""" + 当前采集模块:%s + 没有采集到新闻链接 %s 个 + 资源页链接:%s""" % (current_module, len(nodes), req_meta['cfg']['news_module_link'])) + for node in nodes: + list_data = dict( + title=node.xpath(list_selector['title_s']).get(), + pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(), + pub_link=list_selector['news_link_s'] and response.urljoin( + node.xpath(list_selector['news_link_s']).get()), + news_label=news_label_s and node.xpath(news_label_s).get() + ) + if not list_data['pub_link']: + self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING) + continue + yield response.follow(list_data['pub_link'], + meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'], + detail_selector=req_meta['detail_selector']), + callback=self.parse_news_detail) + + def parse_news_detail(self, response): + req_meta = response.meta + s_cfg = req_meta.get('s_cfg') + detail_selector = req_meta.get('detail_selector') + list_data = req_meta.get('list_data', {}) + contents = 
response.xpath(detail_selector['content_s']).getall() + text = '\n'.join([s.strip() for s in contents]) + news_item = OrgNewsDatabaseItem() + news_item['title'] = list_data.get('title') + news_item['pub_time'] = list_data.get('pub_time') + news_item['news_link'] = list_data.get('pub_link') + news_item['news_label'] = list_data.get('news_label', None) + news_item['news_content'] = text + news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']] + news_item['spider_cls'] = s_cfg['spider_cls'] + news_item['updated_at'] = datetime.now() + yield news_item diff --git a/org_news/org_news/spiders/org_news_lib_test.py b/org_news/org_news/spiders/org_news_lib_test.py new file mode 100644 index 0000000..3f6bf41 --- /dev/null +++ b/org_news/org_news/spiders/org_news_lib_test.py @@ -0,0 +1,19 @@ +import scrapy + +from org_news.utils.read_cfg import read_cfg +from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider + + +class OrgNewsLibTestSpider(OrgNewsLibDatabaseSpider): + name = "org_news_lib_test" + + def start_requests(self): + for cfg in read_cfg('D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'): + # if cfg['spider_cls'] != self.name: continue + list_selector = dict(list_s=cfg.pop('list_s'), label_s=cfg.pop('label_s', None), + title_s=cfg.pop('title_s'), + datetime_s=cfg.pop('datetime_s'), news_link_s=cfg.pop('news_link_s')) + detail_selector = dict(content_s=cfg.pop('content_s')) + yield scrapy.Request(url=cfg['news_module_link'], + meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector), + callback=self.parse_news_list) diff --git a/org_news/org_news/spiders/org_news_sjtu_lib.py b/org_news/org_news/spiders/org_news_sjtu_lib.py new file mode 100644 index 0000000..ffe5afc --- /dev/null +++ b/org_news/org_news/spiders/org_news_sjtu_lib.py @@ -0,0 +1,39 @@ + +import logging +import scrapy +from scrapy_redis.spiders import RedisSpider + +from 
org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider +from org_news.utils.read_cfg import read_cfg +from org_news.utils import tools + + +class OrgNewsSjtuLibSpider(OrgNewsLibDatabaseSpider): + name = "org_news_sjtu_lib" + # allowed_domains = ["www.lib.sjtu.edu.cn"] + # start_urls = ["https://www.lib.sjtu.edu.cn/"] + + def parse_news_list(self, response): + """ + 解析新闻列表页 + """ + req_meta = response.meta + list_selector = req_meta.get('list_selector') + nodes = response.xpath(list_selector['list_s']) + news_label_s = list_selector.get('label_s', None) + for node in nodes: + list_data = dict( + title=node.xpath(list_selector['title_s']).get(), + pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(), + pub_link=list_selector['news_link_s'] and response.urljoin( + node.xpath(list_selector['news_link_s']).get()), + news_label=news_label_s and node.xpath(news_label_s).get() + ) + if not list_data['pub_link']: + self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING) + continue + params = tools.url_parse(list_data['pub_link']) + yield response.follow('https://www.lib.sjtu.edu.cn/f/content/content.shtml?id=%s' % params.get('id'), + meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'], + detail_selector=req_meta['detail_selector']), + callback=self.parse_news_detail) diff --git a/org_news/org_news/spiders/org_news_whu_lib.py b/org_news/org_news/spiders/org_news_whu_lib.py new file mode 100644 index 0000000..d6e74e5 --- /dev/null +++ b/org_news/org_news/spiders/org_news_whu_lib.py @@ -0,0 +1,10 @@ +import scrapy + + +class OrgNewsWhuLibSpider(scrapy.Spider): + name = "org_news_whu_lib" + allowed_domains = ["www.whu.edu.cn"] + start_urls = ["https://www.whu.edu.cn/"] + + def parse(self, response): + pass diff --git a/org_news/org_news/spiders/org_news_xjtu_lib.py b/org_news/org_news/spiders/org_news_xjtu_lib.py new file mode 100644 index 0000000..39e8156 --- /dev/null +++ 
# --- org_news/org_news/spiders/org_news_xjtu_lib.py ---
import logging
import json
from datetime import datetime
from urllib.parse import urlparse

import scrapy
from parsel.selector import Selector

from org_news.items import OrgNewsDatabaseItem
from org_news.utils.read_cfg import read_cfg, format_cfg
from org_news.utils import tools

# The list API wraps each record's fields in numbered slots "0".."6".
FIELD_KEYS = [str(i) for i in range(7)]
engineInstanceId = '361785'


def find_value_by_key(key: str, obj: dict):
    """Return the 'value' of the numbered slot whose 'key' equals *key*, else None."""
    for idx in FIELD_KEYS:
        slot = obj.get(idx)  # .get(): records may carry fewer slots (obj[idx] raised KeyError)
        if isinstance(slot, dict) and slot.get('key') == key:
            return slot.get('value')


class OrgNewsXjtuLibSpider(scrapy.Spider):
    """XJTU library news spider — talks to the site's engine2 JSON APIs."""

    name = "org_news_xjtu_lib"
    allowed_domains = ["www.lib.xjtu.edu.cn"]
    request_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{module_id}/type/more-datas'
    detail_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{news_id}/detail'
    start_urls = ["https://www.xjtu.edu.cn/"]

    def start_requests(self):
        # Raw string (invalid escape sequences in the original literal);
        # NOTE(review): hard-coded absolute path — move to settings.
        cfg_path = r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'
        for cfg in read_cfg(cfg_path):
            if cfg['spider_cls'] != self.name:
                continue
            cfg = format_cfg(cfg)
            list_selector = cfg.pop('list_selector')
            detail_selector = cfg.pop('detail_selector')
            # The module id is the 2nd path segment of the configured link.
            module_id = urlparse(cfg['news_module_link']).path.split('/')[2]
            params = tools.url_parse(cfg['news_module_link'])
            yield scrapy.FormRequest(self.request_api.format(module_id=module_id), method="POST",
                                     dont_filter=True,
                                     # default '' keeps FormRequest from choking on a None value
                                     formdata=dict(engineInstanceId=engineInstanceId, pageNum='1',
                                                   pageSize='20', typeId=params.get('typeId', ''),
                                                   topTypeId='', sw=''),
                                     meta=dict(s_cfg=cfg, list_selector=list_selector,
                                               detail_selector=detail_selector,
                                               websiteId=params.get('websiteId')),
                                     callback=self.parse_news_list)

    def parse_news_list(self, response):
        """Parse the JSON list response and request each article's detail API."""
        req_meta = response.meta
        websiteId = req_meta.get('websiteId')
        if response.status != 200:
            self.log('响应状态码异常')
            return
        resp_json = json.loads(response.text)
        if resp_json.get('status') != 200 or resp_json.get('message') != "请求正确响应":
            self.log('响应内容异常')
            return
        nodes = resp_json.get('data', {}).get('datas', {}).get('datas')
        for node in nodes or []:  # guard: inner 'datas' may be missing/None
            list_data = dict(
                title=find_value_by_key('标题', node),
                pub_time=find_value_by_key('时间', node),
                pub_link=node.get('url') and response.urljoin(node.get('url')),
                news_label=None
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            # FIX: the original f-string contained '¤tBranch' — '&curren'
            # had been mangled into the '¤' character (HTML entity &curren;);
            # the query parameter is 'currentBranch'.
            payload = ('engineInstanceId=%s&typeId=%s&pageId=1&websiteId=%s&currentBranch=0'
                       % (node.get("engineInstanceId", engineInstanceId), node.get("typeId"), websiteId))
            list_data['pub_link'] = pub_link = self.detail_api.format(news_id=node.get('id')) + '?' + payload
            yield scrapy.Request(url=pub_link,
                                 meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                                           detail_selector=req_meta['detail_selector']),
                                 callback=self.parse_news_detail)

    def parse_news_detail(self, response):
        """Extract the article text from the inline JS data blob and emit an item."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        list_data = req_meta.get('list_data', {})

        contents = None
        last_script = response.xpath('/html/script[last()]/text()')
        if last_script:
            data_text = last_script.re_first(r'data: (\{"engineInstanceId".*?\}),\r\n')
            if data_text:  # FIX: json.loads(None) raises TypeError, not JSONDecodeError
                try:
                    data_dic = json.loads(data_text)
                except json.JSONDecodeError:
                    contents = None
                else:
                    # The article HTML sits in the blob's 'content' key.
                    selector = Selector(data_dic.get('content', ''), type='html')
                    contents = selector.xpath('string(.)').get(None)
        if not contents:
            self.logger.warning("没有提取到数据")

        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = contents
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        # NOTE(review): 'spider_cls' is not a declared Field on the item —
        # raises KeyError; declare it in items.py.
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item


# --- org_news/org_news/spiders/seu_lib_resource_dynamics.py ---
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem


class SeuLibResourceDynamicsSpider(scrapy.Spider):
    """SEU library 'resource dynamics' spider (fixed list page, first page only)."""

    name = "seu_lib_resource_dynamics"
    allowed_domains = ["lib.seu.edu.cn"]
    start_urls = ["https://lib.seu.edu.cn/list.php?fid=264&page=1"]

    def parse(self, response):
        nodes = response.xpath('//div[@class="content-right-list"]/ul/li[@class="list-item"]')
        for node in nodes:
            list_data = dict(
                title=node.xpath('./a/span/text()').get(),
                pub_time=node.xpath('./span[@class="item-time"]/text()').get(),
                pub_link=response.urljoin(node.xpath('./a/@href').get()),
            )
            yield response.follow(list_data['pub_link'], callback=self.parse_news_detail,
                                  meta=dict(list_data=list_data))

    def parse_news_detail(self, response):
        list_data = response.meta.get('list_data', {})
        # string(...) evaluates to a single concatenated string, so .get() is
        # enough (the original getall() always produced a 1-element list).
        text = (response.xpath('string(//div[@class="article-wrap"])').get() or '').strip()
        # FIX: the original only print()ed the text and never yielded an item.
        # OrgNewsDatabaseItem is used because plain OrgNewsItem has no
        # news_content field.
        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_content'] = text
        news_item['updated_at'] = datetime.now()
        yield news_item


# --- org_news/org_news/utils/__init__.py ---
# -*- coding: utf-8 -*-
# @Time : 2025/7/24 14:42
# @Author : zhaoxiangpeng
# @File : __init__.py

# --- org_news/org_news/utils/read_cfg.py (module in the next hunk) ---
: zhaoxiangpeng +# @File : read_cfg.py + +import json + + +def read_cfg(file): + with open(file, 'r', encoding='utf-8') as f: + while line := f.readline(): + yield json.loads(line) + + +def format_cfg(cfg: dict): + list_selector = dict(list_s=cfg.pop('list_s'), label_s=cfg.pop('label_s', None), title_s=cfg.pop('title_s'), + datetime_s=cfg.pop('datetime_s'), news_link_s=cfg.pop('news_link_s')) + detail_selector = dict(content_s=cfg.pop('content_s')) + new_cfg = cfg.copy() + new_cfg['list_selector'] = list_selector + new_cfg['detail_selector'] = detail_selector + return new_cfg + + +def test_read_cfg(): + for cfg in read_cfg('/scrapy-demo1/org_news/org_news/selector_cfg/library_selector.txt'): + print(cfg) + print(format_cfg(cfg)) + + +if __name__ == '__main__': + test_read_cfg() diff --git a/org_news/org_news/utils/tools.py b/org_news/org_news/utils/tools.py new file mode 100644 index 0000000..52336c5 --- /dev/null +++ b/org_news/org_news/utils/tools.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/24 14:42 +# @Author : zhaoxiangpeng +# @File : tools.py + +from urllib.parse import urlparse, parse_qs + + +def url_parse(url: str): + """ + url解析为dict + :param url: + :return: + """ + query = urlparse(url).query + params = parse_qs(query) + result = {key: params[key][0] if params[key].__len__() == 1 else params[key] for key in params} + return result diff --git a/org_news/run.py b/org_news/run.py new file mode 100644 index 0000000..f70555a --- /dev/null +++ b/org_news/run.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# @Time : 2025/7/25 13:39 +# @Author : zhaoxiangpeng +# @File : run.py +import os +import re +from scrapy.crawler import CrawlerProcess +from scrapy.settings import Settings +from scrapy.utils.project import get_project_settings +regex = re.compile(r'^(org_news_).*?(_lib)$') + + +def load_spider_script(path): + """ + 加载特定的爬虫脚本 + """ + scripts = os.listdir(path) + spiders = [] + for script in scripts: + if not script.endswith('.py'): + continue 
+ spider_name = script[:-3] + if bool(regex.search(spider_name)): + spiders.append(spider_name) + return spiders + + +process = CrawlerProcess(get_project_settings()) + +# process.crawl('org_news_sjtu_lib') +process.crawl('org_news_xjtu_lib') +process.start() diff --git a/org_news/scrapy.cfg b/org_news/scrapy.cfg new file mode 100644 index 0000000..ab00335 --- /dev/null +++ b/org_news/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = org_news.settings + +[deploy] +#url = http://localhost:6800/ +project = org_news