# -*- coding: utf-8 -*-
# @Time : 2026/1/20 17:06
# @Author : zhaoxiangpeng
# @File : crawl_article_by_id.py
"""Seed the CSSCI article-by-id spider's Redis queue, then run the spider."""
import time
import logging
import json
from typing import List
import redis
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_cssci.spiders.cssci_article_by_id import CssciArticleByIdSpider

# Default seed IDs, used when push_task() is called with no arguments
# (previously these were hard-coded inside push_task).
DEFAULT_THIRD_IDS: List[str] = [
    '11G0412025010007',
    '11C1172023010002',
    '11J0092023020008',
    '44Z0712023010003',
    '11D1022023010001',
    '22D1042023010007',
]


def push_task(third_ids: List[str] = None) -> None:
    """Push crawl tasks onto the spider's Redis start-urls queue.

    Each task is a JSON payload of the form ``{"third_id": "<id>"}`` LPUSHed
    to the ``cssci_article_by_id:start_urls`` list the spider consumes.

    :param third_ids: IDs to enqueue; defaults to ``DEFAULT_THIRD_IDS``
        (backward compatible with the old zero-argument call).
    """
    if third_ids is None:
        third_ids = DEFAULT_THIRD_IDS
    if not third_ids:
        # lpush with zero values raises redis.DataError; nothing to do.
        return
    settings = get_project_settings()
    r = redis.StrictRedis.from_url(settings.get("REDIS_URL"))
    try:
        r.lpush(
            "cssci_article_by_id:start_urls",
            *[json.dumps({'third_id': tid}, ensure_ascii=False) for tid in third_ids],
        )
    finally:
        # Release the connection pool instead of leaking the socket.
        r.close()


def starter() -> None:
    """Run the CSSCI article-by-id spider with project settings (blocks until finished)."""
    process = CrawlerProcess(get_project_settings())
    process.crawl(CssciArticleByIdSpider)
    process.start()


if __name__ == '__main__':
    push_task()
    starter()