add:org_news commit to kc
parent
2e0c44e983
commit
25ffff7604
@ -0,0 +1,25 @@
|
|||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsItem(scrapy.Item):
    """Base item for a scraped organisation-news entry (list-page fields)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()      # headline text scraped from the list page
    pub_time = scrapy.Field()   # publication date/time string as shown on the page
    news_link = scrapy.Field()  # absolute URL of the news detail page

    updated_at = scrapy.Field()  # timestamp set when the item is scraped
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsDatabaseItem(OrgNewsItem):
    """News item extended with detail-page fields, intended for DB storage.

    ``title``/``pub_time``/``news_link``/``updated_at`` are inherited from
    :class:`OrgNewsItem`; scrapy Item subclasses inherit fields automatically,
    so the redundant re-declaration of ``title`` has been removed.
    """
    news_label = scrapy.Field()    # label/category attached to the headline
    news_content = scrapy.Field()  # full article body text
    tags = scrapy.Field()          # keyword tags added by NewsTitleClassifyPipeline
    news_source = scrapy.Field()   # provenance list: [org, sub-org, news module]
    spider_cls = scrapy.Field()    # name of the spider that produced the item
|
@ -0,0 +1,11 @@
|
|||||||
|
# -*- coding: utf-8 -*-
# @Time : 2025/7/21 17:29
# @Author : zhaoxiangpeng
# @File : main.py
"""Entry point: launch a scrapy crawl via the command-line API."""

from scrapy.cmdline import execute


def main():
    """Run the distributed org-news spider (same as ``scrapy crawl ...``)."""
    execute('scrapy crawl org_news_distributed'.split())
    # execute('scrapy crawl org_news_lib_test'.split())
    # execute('scrapy crawl org_news_sjtu_lib'.split())


# Guarded so importing this module no longer starts a crawl as a side effect.
if __name__ == '__main__':
    main()
|
@ -0,0 +1,100 @@
|
|||||||
|
# Define here the models for your spider middleware
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
from scrapy import signals
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsSpiderMiddleware:
    """Spider-middleware scaffold (scrapy project template).

    Every hook below is optional; an undefined hook means scrapy passes the
    objects through unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this factory to build the middleware instance.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response entering the spider.
        # Returning None continues processing; raising aborts it.
        return None

    def process_spider_output(self, response, result, spider):
        # Runs over the spider's results; must yield Request or item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when the spider (or an inner middleware's process_spider_input)
        # raises. Returning None defers to other handlers.
        return None

    async def process_start(self, start):
        # Async pass-through over the spider's start() output (or the matching
        # method of an earlier spider middleware).
        async for element in start:
            yield element

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsDownloaderMiddleware:
    """Downloader-middleware scaffold (scrapy project template).

    All hooks are optional; a missing hook means the middleware does not
    modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses this factory to build the middleware instance.
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_request(self, request, spider):
        # Runs for every request going through the downloader.
        # None -> continue normal handling; could also return a Response or
        # Request object, or raise IgnoreRequest to trigger process_exception().
        return None

    def process_response(self, request, response, spider):
        # Runs on every downloaded response; pass it through unchanged.
        # (Must return a Response, a Request, or raise IgnoreRequest.)
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or process_request() raises.
        # None -> let other middlewares / handlers process the exception.
        return None

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
|
@ -0,0 +1,67 @@
|
|||||||
|
# Define your item pipelines here
|
||||||
|
#
|
||||||
|
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||||
|
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
|
||||||
|
import re
|
||||||
|
import pymongo
|
||||||
|
from itemadapter import ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsPipeline:
    """Default no-op pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # Nothing to transform; hand the item to the next pipeline stage.
        return item
|
||||||
|
|
||||||
|
|
||||||
|
class NewsTitleClassifyPipeline:
    """Tag items by scanning the headline for known keyword groups.

    Only the ``Database`` keyword group is matched for now; all hits are
    stored in ``item['tags']`` as a list.
    """
    # keyword groups: database-resource terms and HR/staffing terms
    __KEYWORDS__ = dict(
        Database=['开通', '试用', '停订', '新增', '时长'],
        HumanAffairs=['现在馆长', '馆长更换']
    )
    # alternation over the Database keywords, compiled once at class level
    keyword_db_pattern = re.compile('|'.join(__KEYWORDS__['Database']))

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        news_title = adapter.get("title")
        # Guard: a missing/None title used to crash re.findall with a TypeError;
        # emit an empty tag list instead.
        item['tags'] = self.keyword_db_pattern.findall(news_title) if news_title else []
        return item
|
||||||
|
|
||||||
|
|
||||||
|
class NewsStandardPipeline:
    """Normalise article text by stripping CR/LF and all whitespace chars."""
    # matches any single whitespace character (incl. \r and \n); compiled once
    content_standard_pattern = re.compile(r'[\r\n\s]')

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        news_content = adapter.get("news_content")
        # Guard: items without content used to crash re.sub with a TypeError;
        # leave the item untouched in that case.
        if news_content is not None:
            item['news_content'] = self.content_standard_pattern.sub('', news_content)
        return item
|
||||||
|
|
||||||
|
|
||||||
|
class MongoPipeline:
    """Persist scraped items into MongoDB, one document per item."""
    # target collection inside the configured database
    collection_name = "data_org_news"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri  # full MongoDB connection URI
        self.mongo_db = mongo_db    # database name

    @classmethod
    def from_crawler(cls, crawler):
        # Connection parameters come from project settings
        # (MONGO_URI / MONGO_DATABASE, with "items" as the DB fallback).
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
        )

    def open_spider(self, spider):
        # One client per spider run; released again in close_spider().
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Insert the plain-dict form of the item, then pass the item on
        # unchanged so later pipeline stages still see it.
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
|
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 9:23
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : export_data.py
|
@ -0,0 +1,26 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 11:06
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : task_push.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
import redis
|
||||||
|
|
||||||
|
import org_news.settings as settings
|
||||||
|
from org_news.utils.read_cfg import read_cfg, format_cfg
|
||||||
|
|
||||||
|
|
||||||
|
def do_test():
    """Push the *test* selector-config file into the redis start-urls queue."""
    r = redis.StrictRedis.from_url(settings.REDIS_URL)
    # Raw string: the original non-raw Windows path contained invalid escape
    # sequences (e.g. "\s" in "\selector_cfg"), a SyntaxWarning today and a
    # SyntaxError in future Python versions.
    for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'):
        r.lpush('org_news_distributed:start_urls',
                json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':')))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Push the production selector-config file into the redis start-urls queue."""
    r = redis.StrictRedis.from_url(settings.REDIS_URL)
    # Raw string avoids invalid escape sequences in the Windows path
    # (the original non-raw literal contained "\s" etc.).
    for cfg in read_cfg(r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector.txt'):
        r.lpush('org_news_distributed:start_urls',
                json.dumps(format_cfg(cfg), ensure_ascii=False, separators=(',', ':')))


if __name__ == '__main__':
    main()
|
@ -0,0 +1,61 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/23 11:27
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : a_insert_xpath.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
|
||||||
|
class CfgTemplate:
    """Template holding one site's scraping configuration (XPath selectors).

    Instances are filled via attribute assignment and serialised through
    ``__dict__`` into the selector config files (one JSON object per line).
    """
    org_name = None           # organisation name (e.g. a university)
    org_domain = None         # organisation homepage URL
    second_org_name = None    # sub-organisation name (e.g. the library)
    second_org_domain = None  # sub-organisation homepage URL
    news_module = None        # name of the news section
    news_module_link = None   # URL of the news listing page
    list_s = None             # XPath: news list entries
    title_s = None            # XPath: headline (relative to a list entry)
    datetime_s = None         # XPath: publication date (relative to a list entry)
    news_link_s = None        # XPath: detail-page link (relative to a list entry)
    label_s = None            # XPath: label/category (relative, optional)
    content_s = None          # XPath: article body on the detail page
    spider_cls = None         # name of the spider that should handle this config
    invalid = None            # set to 1 to mark the config as disabled
|
||||||
|
|
||||||
|
|
||||||
|
def insert2testCfgFile(s_cfg: dict):
    """Overwrite the test selector file with a single JSON config line."""
    pprint(s_cfg)
    record = json.dumps(s_cfg, ensure_ascii=False) + '\n'
    # 'w+' truncates the file, so the test file only ever holds one config
    with open('library_selector_test.txt', 'w+', encoding='utf-8') as fh:
        fh.write(record)
|
||||||
|
|
||||||
|
|
||||||
|
def insert2cfgFile(s_cfg: dict):
    """Append one JSON config line to the production selector file."""
    pprint(s_cfg)
    record = json.dumps(s_cfg, ensure_ascii=False) + '\n'
    # 'a+' appends, so existing configs in the production file are preserved
    with open('library_selector.txt', 'a+', encoding='utf-8') as fh:
        fh.write(record)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build one site's selector config and write it to both config files."""
    module = CfgTemplate()
    # Plain attribute assignment replaces the unidiomatic
    # module.__setattr__('name', value) call chain — behaviour is identical.
    module.org_name = '西安交通大学'  # organisation name
    module.org_domain = 'https://www.xjtu.edu.cn/'  # organisation domain
    module.second_org_name = '图书馆'  # sub-organisation name
    module.second_org_domain = 'http://www.lib.xjtu.edu.cn/'  # sub-organisation domain
    module.news_module = '通知公告'  # news section name
    module.news_module_link = 'http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362'  # news section link
    module.list_s = None        # news list selector
    module.title_s = None       # news title selector
    module.news_link_s = None   # news link selector
    module.datetime_s = None    # publication datetime selector
    module.label_s = None       # news label selector
    module.content_s = None     # article content selector
    module.spider_cls = 'org_news_lib_database'  # spider name
    # module.invalid = 1        # uncomment to mark this config as disabled
    print(module)
    insert2testCfgFile(module.__dict__)
    insert2cfgFile(module.__dict__)


if __name__ == '__main__':
    main()
|
@ -0,0 +1,22 @@
|
|||||||
|
{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "新闻资讯", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=263", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "东南大学", "org_domain": "https://www.seu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.seu.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.seu.edu.cn/list.php?fid=264", "list_s": "//div[@class=\"content-right-list\"]/ul/li[@class=\"list-item\"]", "title_s": "./a/span/text()", "datetime_s": "./span[@class=\"item-time\"]/text()", "news_link_s": "./a/@href", "label_s": null, "content_s": "string(//div[@class=\"article-wrap\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "通知公告", "news_module_link": "https://lib.tsinghua.edu.cn/tzgg.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.tsinghua.edu.cn/zydt.htm", "list_s": "//div[@class=\"main\"]/div/div[@class=\"g-box\"]/ul[@class=\"notice-list\"]/li", "title_s": "./div[@class=\"notice-list-tt\"]/a/text()", "news_link_s": "./div[@class=\"notice-list-tt\"]/a/@href", "datetime_s": "./div[@class=\"notice-date\"]/text()", "label_s": "./div[contains(@class, \"notice-label\")]/text()", "content_s": "string(//div[@class=\"main\"]/div/div[@class=\"g-box\"]/div[@class=\"col-main\"]/div[1]//div[@class=\"concon\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "清华大学", "org_domain": "https://www.tsinghua.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.tsinghua.edu.cn/", "news_module": "活动日历", "news_module_link": "https://lib.tsinghua.edu.cn/hdrl.htm", "list_s": "//div[@id=\"kzyc2\"]//div[@class=\"rl-box\"]/ul//li", "title_s": "./div[contains(@class, \"rl-title\")]/a/text()", "news_link_s": "./div[contains(@class, \"rl-title\")]/a/@href", "datetime_s": "./div[@class=\"rl-label\"]/span[@class=\"rl-date\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"library-content-content\"]/div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "新闻", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/261xwlb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]/div[@class=\"sub_0261 ul-inline\"]/ul/li", "title_s": "./a/div/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "公告", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/262gglb/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "资源", "news_module_link": "https://lib.pku.edu.cn/2xxzzfw/26xwgg/gm/index.htm", "list_s": "//div[@class=\"content\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./a/div[contains(@class, \"title\")]/text()", "news_link_s": "./a/@href", "datetime_s": "./a/div[contains(@class, \"time\")]/text()", "label_s": null, "content_s": "string(//div[@class=\"content\"]//div[contains(@class, \"page_article\")])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "北京大学", "org_domain": "https://www.pku.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.pku.edu.cn/", "news_module": "活动", "news_module_link": "https://www.lib.pku.edu.cn/4whjy/45hdzljz/453qb/index.htm", "list_s": "//div[@class=\"wrap_sub0435\"]/div[@class=\"row\"]//div[contains(@class, \"ul-inline\")]/ul/li", "title_s": "./div/div/div[@class=\"bottom\"]/a/text()", "news_link_s": "./div/div/div[@class=\"bottom\"]/a/@href", "datetime_s": "./div/div/div[@class=\"bottom\"]/div[@class=\"info\"]/p/i[contains(@class, \"icon-shijianfuxing\")]/../text()", "label_s": "./div/div/a[contains(@class, \"type\")]/text()", "content_s": "string(//div[@class=\"content\"]/div/div[@class=\"wrap_sub_0262\"])", "spider_cls": "org_news_lib_browser", "invalid": 1}
|
||||||
|
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "通知公告", "news_module_link": "https://libweb.zju.edu.cn/39478/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://libweb.zju.edu.cn/55543/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "浙江大学", "org_domain": "https://www.zju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://libweb.zju.edu.cn/", "news_module": "本馆新闻", "news_module_link": "https://libweb.zju.edu.cn/55989/list.htm", "list_s": "//div[@id=\"wp_news_w6\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"news_meta\"]/text()", "label_s": null, "content_s": "string(//div[@class=\"wp_articlecontent\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "新闻通告", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=3&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
|
||||||
|
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "资源动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=4&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": ".//div[@class=\"resource_content_tag\"]/ul/li/text()", "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
|
||||||
|
{"org_name": "上海交通大学", "org_domain": "https://www.sjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://www.lib.sjtu.edu.cn/f/main/index.shtml", "news_module": "融媒体动态", "news_module_link": "https://www.lib.sjtu.edu.cn/f/content/list.shtml?Lid=26&lang=zh-cn", "list_s": "//div[@class=\"result_div_list\"]/div/ul/li", "title_s": ".//div[@class=\"resource_content\"]/div[@class=\"resource_content_title\"]/text()", "news_link_s": ".//div[@class=\"resource_content_more\"]/a/@href", "datetime_s": ".//div[@class=\"resource_content_time\"]/text()", "label_s": null, "content_s": "string(/html/body/div)", "spider_cls": "org_news_sjtu_lib"}
|
||||||
|
{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.nju.edu.cn/dzzy/zydt1.htm", "list_s": "//div[@class=\"zydt\"]/ul/li", "title_s": "./span/a/text()", "news_link_s": "./span/a/@href", "datetime_s": "./span[@class=\"time\"]/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "南京大学", "org_domain": "https://www.nju.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.nju.edu.cn/", "news_module": "新闻通知", "news_module_link": "https://lib.nju.edu.cn/xw/xwtz.htm", "list_s": "//div[@class=\"gqzx-list\"]/ul/li", "title_s": "./a/text()", "news_link_s": "./a/@href", "datetime_s": "./span/text()", "label_s": null, "content_s": "string(//div[@id=\"vsb_content\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "资源动态", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/资源动态/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p/text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "服务公告", "news_module_link": "https://lib.ustc.edu.cn/category/cat_news/服务公告/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "中国科学技术大学", "org_domain": "https://www.ustc.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "https://lib.ustc.edu.cn/", "news_module": "讲座培训", "news_module_link": "https://lib.ustc.edu.cn/category/讲座培训/", "list_s": "//div[@id=\"myTabContent\"]/div/ul/li", "title_s": "./a/p[@class=\"ellipsis11\"]//text()", "news_link_s": "./a/@href", "datetime_s": "./a/span/text()", "label_s": "./a/samp/text()", "content_s": "string(//div[@class=\"post-body\"])", "spider_cls": "org_news_lib_database"}
|
||||||
|
{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "资源信息", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493363", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1}
|
||||||
|
{"org_name": "西安交通大学", "org_domain": "https://www.xjtu.edu.cn/", "second_org_name": "图书馆", "second_org_domain": "http://www.lib.xjtu.edu.cn/", "news_module": "通知公告", "news_module_link": "http://www.lib.xjtu.edu.cn/application/236872/more?wfwfid=17071&websiteId=27676&pageId=34094&originTypeId=&typeId=2493362", "list_s": null, "title_s": null, "news_link_s": null, "datetime_s": null, "label_s": null, "content_s": null, "spider_cls": "org_news_xjtu_lib", "invalid": 1}
|
@ -0,0 +1,101 @@
|
|||||||
|
# Scrapy settings for org_news project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "org_news"

SPIDER_MODULES = ["org_news.spiders"]
NEWSPIDER_MODULE = "org_news.spiders"

ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "org_news (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Concurrency and throttling settings
#CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 1

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "org_news.middlewares.OrgNewsSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

RETRY_ENABLED = True
RETRY_TIMES = 2  # retry twice on failure (i.e. up to 3 attempts in total)
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # extended with some common client-error codes
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550
    # "org_news.middlewares.OrgNewsDownloaderMiddleware": 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "org_news.pipelines.OrgNewsPipeline": 300,
    "org_news.pipelines.NewsTitleClassifyPipeline": 400,
    "org_news.pipelines.NewsStandardPipeline": 410,
    # "org_news.pipelines.MongoPipeline": 500,
}

# MongoDB connection used by pipelines.MongoPipeline
MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
MONGO_DATABASE = "science2"

# Redis queue that feeds start URLs to the distributed spider (see task_push.py)
REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
FEED_EXPORT_ENCODING = "utf-8"
|
@ -0,0 +1,4 @@
|
|||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
@ -0,0 +1,66 @@
|
|||||||
|
from datetime import datetime

import scrapy

from org_news.items import OrgNewsDatabaseItem
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsFudanLibSpider(scrapy.Spider):
|
||||||
|
name = "org_news_fudan_lib"
|
||||||
|
allowed_domains = ["www.fudan.edu.cn"]
|
||||||
|
start_urls = ["https://www.fudan.edu.cn/"]
|
||||||
|
cfgs = [
|
||||||
|
dict(org_name='复旦大学',
|
||||||
|
org_domain='https://www.fudan.edu.cn/',
|
||||||
|
second_org_name='图书馆',
|
||||||
|
second_org_domain='https://library.fudan.edu.cn/',
|
||||||
|
news_module='资源动态',
|
||||||
|
news_module_link='https://library.fudan.edu.cn/zydtx/list.htm')
|
||||||
|
]
|
||||||
|
|
||||||
|
def start_requests(self):
|
||||||
|
yield scrapy.FormRequest(url="https://library.fudan.edu.cn/_wp3services/generalQuery?queryObj=articles",
|
||||||
|
formdata=dict(
|
||||||
|
siteId=928, rows=14, columnId=42893, pageIndex=1,
|
||||||
|
returnInfos='[{"field":"title","name":"title"},{"field":"modifyTime","pattern":[{"name":"d","value":"yyyy-MM-dd"}],"name":"modifyTime"}]',
|
||||||
|
conditions='[]', orders='[{"field":"modifyTime","type":"desc"}]'
|
||||||
|
))
|
||||||
|
|
||||||
|
def parse_news_list(self, response):
|
||||||
|
"""
|
||||||
|
解析新闻列表页
|
||||||
|
"""
|
||||||
|
req_meta = response.meta
|
||||||
|
list_selector = req_meta.get('list_selector')
|
||||||
|
nodes = response.xpath(list_selector['list_s'])
|
||||||
|
news_label_s = list_selector.get('label_s', None)
|
||||||
|
for node in nodes:
|
||||||
|
list_data = dict(
|
||||||
|
title=node.xpath(list_selector['title_s']).get(),
|
||||||
|
pub_time=list_selector['datetime_s'] and node.xpath(list_selector['datetime_s']).get(),
|
||||||
|
pub_link=list_selector['news_link_s'] and response.urljoin(
|
||||||
|
node.xpath(list_selector['news_link_s']).get()),
|
||||||
|
news_label=news_label_s and node.xpath(news_label_s).get()
|
||||||
|
)
|
||||||
|
if not list_data['pub_link']:
|
||||||
|
self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
|
||||||
|
continue
|
||||||
|
yield response.follow(list_data['pub_link'],
|
||||||
|
meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
|
||||||
|
detail_selector=req_meta['detail_selector']),
|
||||||
|
callback=self.parse_news_detail)
|
||||||
|
|
||||||
|
def parse_news_detail(self, response):
    """Parse an article page and yield a populated OrgNewsDatabaseItem.

    List-page fields arrive via ``response.meta['list_data']``; the article
    body is joined from the configured content selector.
    """
    meta = response.meta
    cfg = meta.get('s_cfg')
    selectors = meta.get('detail_selector')
    listing = meta.get('list_data', {})

    fragments = response.xpath(selectors['content_s']).getall()
    body_text = '\n'.join(fragment.strip() for fragment in fragments)

    item = OrgNewsDatabaseItem()
    item['title'] = listing.get('title')
    item['pub_time'] = listing.get('pub_time')
    item['news_link'] = listing.get('pub_link')
    item['news_label'] = listing.get('news_label', None)
    item['news_content'] = body_text
    # Source is the (org, sub-org, module) triple from the crawl config.
    item['news_source'] = [cfg['org_name'], cfg['second_org_name'], cfg['news_module']]
    item['spider_cls'] = cfg['spider_cls']
    item['updated_at'] = datetime.now()
    yield item
|
@ -0,0 +1,10 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsHustLibSpider(scrapy.Spider):
    """Placeholder spider for HUST (华中科技大学); parsing is not implemented yet."""

    name = "org_news_hust_lib"
    allowed_domains = ["www.hust.edu.cn"]
    start_urls = ["https://www.hust.edu.cn/"]

    def parse(self, response):
        """Default callback — scaffold only, no extraction logic yet."""
        pass
|
@ -0,0 +1,10 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsLibBrowserSpider(scrapy.Spider):
    """Placeholder library-news spider (generic domain); parsing not implemented yet."""

    name = "org_news_lib_browser"
    allowed_domains = ["lib.edu.cn"]
    start_urls = ["https://lib.edu.cn"]

    def parse(self, response):
        """Default callback — scaffold only, no extraction logic yet."""
        pass
|
@ -0,0 +1,19 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
from org_news.utils.read_cfg import read_cfg
|
||||||
|
from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsLibTestSpider(OrgNewsLibDatabaseSpider):
    """Test spider: drives the generic database spider from a local selector-config file."""

    name = "org_news_lib_test"

    def start_requests(self):
        """Read one JSON config per line and issue the list-page request for each.

        Bug fix: the config path is now a raw string — the original non-raw
        literal relied on invalid escape sequences ("\\G", "\\s", ...), which
        emit SyntaxWarning on Python 3.12+ and only work by accident.
        """
        # NOTE(review): absolute developer path — consider moving to settings.
        cfg_file = r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'
        for cfg in read_cfg(cfg_file):
            # if cfg['spider_cls'] != self.name: continue
            # Split the flat config into list-page vs detail-page selectors.
            list_selector = dict(
                list_s=cfg.pop('list_s'),
                label_s=cfg.pop('label_s', None),
                title_s=cfg.pop('title_s'),
                datetime_s=cfg.pop('datetime_s'),
                news_link_s=cfg.pop('news_link_s'),
            )
            detail_selector = dict(content_s=cfg.pop('content_s'))
            yield scrapy.Request(
                url=cfg['news_module_link'],
                meta=dict(s_cfg=cfg, list_selector=list_selector,
                          detail_selector=detail_selector),
                callback=self.parse_news_list,
            )
|
@ -0,0 +1,39 @@
|
|||||||
|
|
||||||
|
import logging
|
||||||
|
import scrapy
|
||||||
|
from scrapy_redis.spiders import RedisSpider
|
||||||
|
|
||||||
|
from org_news.spiders.org_news_lib_database import OrgNewsLibDatabaseSpider
|
||||||
|
from org_news.utils.read_cfg import read_cfg
|
||||||
|
from org_news.utils import tools
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsSjtuLibSpider(OrgNewsLibDatabaseSpider):
    """SJTU library spider: list rows are rewritten to the content.shtml
    endpoint keyed by the article id found in the listed link's query string."""

    name = "org_news_sjtu_lib"
    # allowed_domains = ["www.lib.sjtu.edu.cn"]
    # start_urls = ["https://www.lib.sjtu.edu.cn/"]

    def parse_news_list(self, response):
        """解析新闻列表页 — extract each row and follow the detail endpoint."""
        meta = response.meta
        selectors = meta.get('list_selector')
        label_xpath = selectors.get('label_s', None)
        for entry in response.xpath(selectors['list_s']):
            listing = dict(
                title=entry.xpath(selectors['title_s']).get(),
                pub_time=selectors['datetime_s'] and entry.xpath(selectors['datetime_s']).get(),
                pub_link=selectors['news_link_s'] and response.urljoin(
                    entry.xpath(selectors['news_link_s']).get()),
                news_label=label_xpath and entry.xpath(label_xpath).get()
            )
            if not listing['pub_link']:
                self.log('没有找到link: %s' % listing['title'], level=logging.WARNING)
                continue
            # Pull the article id out of the listed link's query string and
            # request the content endpoint with it.
            query = tools.url_parse(listing['pub_link'])
            detail_url = 'https://www.lib.sjtu.edu.cn/f/content/content.shtml?id=%s' % query.get('id')
            yield response.follow(
                detail_url,
                meta=dict(list_data=listing, s_cfg=meta['s_cfg'],
                          detail_selector=meta['detail_selector']),
                callback=self.parse_news_detail,
            )
|
@ -0,0 +1,10 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsWhuLibSpider(scrapy.Spider):
    """Placeholder spider for WHU (武汉大学); parsing is not implemented yet."""

    name = "org_news_whu_lib"
    allowed_domains = ["www.whu.edu.cn"]
    start_urls = ["https://www.whu.edu.cn/"]

    def parse(self, response):
        """Default callback — scaffold only, no extraction logic yet."""
        pass
|
@ -0,0 +1,113 @@
|
|||||||
|
import logging
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from parsel.selector import Selector
|
||||||
|
from org_news.items import OrgNewsDatabaseItem
|
||||||
|
from org_news.utils.read_cfg import read_cfg, format_cfg
|
||||||
|
from org_news.utils import tools
|
||||||
|
|
||||||
|
# The XJTU "engine2" API nests a node's field objects under the positional
# string keys '0'..'6'.
FIELD_KEYS = [str(i) for i in range(7)]
# Site-wide engine instance id used by the XJTU news API.
engineInstanceId = '361785'


def find_value_by_key(key: str, obj: dict):
    """Return the 'value' of the field dict in *obj* whose 'key' matches *key*.

    Bug fix: the original ``obj[idx]`` raised KeyError on nodes that omit one
    of the positional keys; lookups are now defensive and None is returned
    explicitly when no field matches.

    :param key: field name to look for (e.g. '标题', '时间')
    :param obj: one API node, with field dicts under keys '0'..'6'
    :return: the matching field's value, or None
    """
    for idx in FIELD_KEYS:
        field = obj.get(idx)  # node may omit some positional keys
        if isinstance(field, dict) and field.get('key') == key:
            return field.get('value')
    return None
|
||||||
|
|
||||||
|
|
||||||
|
class OrgNewsXjtuLibSpider(scrapy.Spider):
    """XJTU library news spider.

    The site serves news through the "engine2" JSON API: the list endpoint is
    POSTed for a page of nodes, and each detail page embeds its payload as
    JSON inside the page's last <script> tag.
    """

    name = "org_news_xjtu_lib"
    allowed_domains = ["www.lib.xjtu.edu.cn"]
    # POST target for one page of a module's news list.
    request_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{module_id}/type/more-datas'
    # GET target for a single article's detail payload.
    detail_api = 'http://www.lib.xjtu.edu.cn/engine2/general/{news_id}/detail'
    start_urls = ["https://www.xjtu.edu.cn/"]

    def start_requests(self):
        """Issue the list-API request for every config entry owned by this spider.

        Bug fix: the config path is a raw string now — the original non-raw
        literal relied on invalid escape sequences ("\\G", "\\s", ...) that
        warn on Python 3.12+.
        """
        # NOTE(review): absolute developer path — consider moving to settings.
        cfg_file = r'D:\GitHouse\python-topic\scrapy-demo1\org_news\org_news\selector_cfg\library_selector_test.txt'
        for cfg in read_cfg(cfg_file):
            if cfg['spider_cls'] != self.name:
                continue
            cfg = format_cfg(cfg)
            list_selector = cfg.pop('list_selector')
            detail_selector = cfg.pop('detail_selector')
            # The module id is the second path segment of the configured link.
            path = urlparse(cfg['news_module_link']).path
            module_id = path.split('/')[2]
            params = tools.url_parse(cfg['news_module_link'])
            yield scrapy.FormRequest(
                self.request_api.format(module_id=module_id), method="POST", dont_filter=True,
                formdata=dict(engineInstanceId=engineInstanceId, pageNum='1', pageSize='20',
                              typeId=params.get('typeId'), topTypeId='', sw=''),
                meta=dict(s_cfg=cfg, list_selector=list_selector, detail_selector=detail_selector,
                          websiteId=params.get('websiteId')),
                callback=self.parse_news_list,
            )

    def parse_news_list(self, response):
        """解析新闻列表页 — walk one JSON page of list nodes and follow details."""
        req_meta = response.meta
        websiteId = req_meta.get('websiteId')
        if response.status != 200:
            self.log('响应状态码异常')
            return
        resp_json = json.loads(response.text)
        if resp_json.get('status') != 200 or resp_json.get('message') != "请求正确响应":
            self.log('响应内容异常')
            return
        nodes = resp_json.get('data', {}).get('datas', {}).get('datas')

        for node in nodes:
            list_data = dict(
                title=find_value_by_key('标题', node),
                pub_time=find_value_by_key('时间', node),
                pub_link=node.get('url') and response.urljoin(node.get('url')),
                news_label=None
            )
            if not list_data['pub_link']:
                self.log('没有找到link: %s' % list_data['title'], level=logging.WARNING)
                continue
            # Fix: the query string read "...websiteId={websiteId}¤tBranch=0" —
            # HTML-entity garbling of "&currentBranch" ("&curren" -> ¤);
            # restored to the intended parameter name.
            payload = f'engineInstanceId={node.get("engineInstanceId", engineInstanceId)}&typeId={node.get("typeId")}&pageId=1&websiteId={websiteId}&currentBranch=0'
            list_data['pub_link'] = pub_link = self.detail_api.format(news_id=node.get('id')) + '?' + payload
            # self.logger.debug('publink: %s' % pub_link)
            yield scrapy.Request(
                url=pub_link,
                meta=dict(list_data=list_data, s_cfg=req_meta['s_cfg'],
                          detail_selector=req_meta['detail_selector']),
                callback=self.parse_news_detail,
            )

    def parse_news_detail(self, response):
        """Extract the article body from the JSON embedded in the last <script> tag."""
        req_meta = response.meta
        s_cfg = req_meta.get('s_cfg')
        list_data = req_meta.get('list_data', {})

        contents = None
        last_script = response.xpath('/html/script[last()]/text()')
        if last_script:
            data_text = last_script.re_first(r'data: (\{"engineInstanceId".*?\}),\r\n')
            # Bug fix: re_first() returns None when the pattern is absent, and
            # json.loads(None) raises TypeError — which the original except
            # clause (JSONDecodeError only) did not catch.
            if data_text:
                try:
                    data_dic = json.loads(data_text)
                    selector = Selector(data_dic.get('content', ''), type='html')
                except json.decoder.JSONDecodeError:
                    contents = None
                else:
                    contents = selector.xpath('string(.)').get(None)
        if not contents:
            self.logger.warning("没有提取到数据")

        news_item = OrgNewsDatabaseItem()
        news_item['title'] = list_data.get('title')
        news_item['pub_time'] = list_data.get('pub_time')
        news_item['news_link'] = list_data.get('pub_link')
        news_item['news_label'] = list_data.get('news_label', None)
        news_item['news_content'] = contents
        news_item['news_source'] = [s_cfg['org_name'], s_cfg['second_org_name'], s_cfg['news_module']]
        news_item['spider_cls'] = s_cfg['spider_cls']
        news_item['updated_at'] = datetime.now()
        yield news_item
|
@ -0,0 +1,27 @@
|
|||||||
|
import scrapy
|
||||||
|
|
||||||
|
from org_news.items import OrgNewsItem
|
||||||
|
|
||||||
|
|
||||||
|
class SeuLibResourceDynamicsSpider(scrapy.Spider):
    """SEU library "resource dynamics" spider (detail handling is still a stub)."""

    name = "seu_lib_resource_dynamics"
    allowed_domains = ["lib.seu.edu.cn"]
    start_urls = ["https://lib.seu.edu.cn/list.php?fid=264&page=1"]

    def parse(self, response):
        """Walk the list items and follow each article link."""
        rows = response.xpath('//div[@class="content-right-list"]/ul/li[@class="list-item"]')
        for row in rows:
            listing = {
                'title': row.xpath('./a/span/text()').get(),
                'pub_time': row.xpath('./span[@class="item-time"]/text()').get(),
                'pub_link': response.urljoin(row.xpath('./a/@href').get()),
            }
            yield response.follow(listing['pub_link'], callback=self.parse_news_detail,
                                  meta=dict(list_data=listing))

    def parse_news_detail(self, response):
        """Stub: join the article text and print it (no item is emitted yet)."""
        listing = response.meta.get('list_data', {})  # fetched but unused so far
        fragments = response.xpath('string(//div[@class="article-wrap"])').getall()
        article_text = '\n'.join(fragment.strip() for fragment in fragments)
        print(article_text)
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 14:42
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : __init__.py.py
|
@ -0,0 +1,32 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/23 15:02
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : read_cfg.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def read_cfg(file):
    """Yield one config dict per JSON line in *file* (JSON-Lines format).

    Bug fix: blank lines (including a trailing newline) made the original
    ``json.loads(line)`` raise JSONDecodeError and abort the whole generator;
    they are now skipped.

    :param file: path to a UTF-8 JSON-Lines config file
    :return: generator of dicts, one per non-blank line
    """
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
|
||||||
|
|
||||||
|
|
||||||
|
def format_cfg(cfg: dict):
    """Split a flat selector config into grouped list/detail selector dicts.

    NOTE: the selector keys are pop()ed, so *cfg* is mutated in place; the
    returned dict is a copy of the remaining keys plus the two groups.

    :param cfg: flat config dict containing the ``*_s`` selector keys
    :return: new dict with 'list_selector' and 'detail_selector' sub-dicts
    """
    list_selector = {
        'list_s': cfg.pop('list_s'),
        'label_s': cfg.pop('label_s', None),
        'title_s': cfg.pop('title_s'),
        'datetime_s': cfg.pop('datetime_s'),
        'news_link_s': cfg.pop('news_link_s'),
    }
    detail_selector = {'content_s': cfg.pop('content_s')}
    formatted = cfg.copy()
    formatted['list_selector'] = list_selector
    formatted['detail_selector'] = detail_selector
    return formatted
|
||||||
|
|
||||||
|
|
||||||
|
def test_read_cfg():
    """Smoke test: print every config line raw and after format_cfg()."""
    cfg_path = '/scrapy-demo1/org_news/org_news/selector_cfg/library_selector.txt'
    for cfg in read_cfg(cfg_path):
        print(cfg)
        print(format_cfg(cfg))


if __name__ == '__main__':
    test_read_cfg()
|
@ -0,0 +1,18 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/24 14:42
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : tools.py
|
||||||
|
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
|
|
||||||
|
|
||||||
|
def url_parse(url: str):
    """url解析为dict — parse the query string of *url* into a dict.

    Single-valued parameters are flattened to their scalar string; repeated
    parameters keep their list of values (parse_qs semantics).

    :param url: URL whose query part is parsed
    :return: dict mapping parameter name -> str or list[str]
    """
    params = parse_qs(urlparse(url).query)
    # len(values) instead of the original dunder call values.__len__()
    return {key: values[0] if len(values) == 1 else values
            for key, values in params.items()}
|
@ -0,0 +1,32 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2025/7/25 13:39
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : run.py
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
from scrapy.settings import Settings
|
||||||
|
from scrapy.utils.project import get_project_settings
|
||||||
|
# Matches spider module names shaped like "org_news_<school>_lib".
regex = re.compile(r'^(org_news_).*?(_lib)$')


def load_spider_script(path):
    """加载特定的爬虫脚本 — list spider module names under *path*.

    Only ``*.py`` files whose stem matches the ``org_news_*_lib`` naming
    scheme are kept; the ``.py`` suffix is stripped.

    :param path: directory to scan
    :return: list of matching module names (listdir order)
    """
    return [
        fname[:-3]
        for fname in os.listdir(path)
        if fname.endswith('.py') and regex.search(fname[:-3])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# Run the selected crawler(s) with the project settings.
# Fix: guarded under __main__ so importing this module no longer starts a
# crawl as an import-time side effect.
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    # process.crawl('org_news_sjtu_lib')
    process.crawl('org_news_xjtu_lib')
    process.start()
|
@ -0,0 +1,11 @@
|
|||||||
|
# Automatically created by: scrapy startproject
|
||||||
|
#
|
||||||
|
# For more information about the [deploy] section see:
|
||||||
|
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||||
|
|
||||||
|
[settings]
|
||||||
|
default = org_news.settings
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
#url = http://localhost:6800/
|
||||||
|
project = org_news
|
Loading…
Reference in New Issue