# cssci: 根据id进行采集 (crawl CSSCI articles by id)
# -*- coding: utf-8 -*-
|
||||||
|
# @Time : 2026/1/20 17:06
|
||||||
|
# @Author : zhaoxiangpeng
|
||||||
|
# @File : crawl_article_by_id.py
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
from typing import List
|
||||||
|
import redis
|
||||||
|
from twisted.internet import defer
|
||||||
|
from scrapy.crawler import CrawlerProcess
|
||||||
|
from scrapy.utils.project import get_project_settings
|
||||||
|
from science_article_cssci.spiders.cssci_article_by_id import CssciArticleByIdSpider
|
||||||
|
|
||||||
|
|
||||||
|
def push_task(third_ids=None):
    """Seed the spider's Redis start-urls queue with article-id tasks.

    Each task is a JSON payload of the form ``{"third_id": "..."}`` pushed
    onto the ``cssci_article_by_id:start_urls`` list, which the
    ``CssciArticleByIdSpider`` consumes.

    :param third_ids: optional iterable of CSSCI ``third_id`` strings.
        Defaults to the original built-in sample batch, so calling
        ``push_task()`` with no arguments behaves exactly as before.
    """
    if third_ids is None:
        # Original hard-coded sample batch, kept as the default.
        third_ids = [
            '11G0412025010007',
            '11C1172023010002',
            '11J0092023020008',
            '44Z0712023010003',
            '11D1022023010001',
            '22D1042023010007',
        ]
    settings = get_project_settings()
    # REDIS_URL comes from the Scrapy project settings — TODO confirm it is
    # always defined; from_url(None) would raise here.
    r = redis.StrictRedis.from_url(settings.get("REDIS_URL"))
    # ensure_ascii=False keeps the payload bytes readable; one LPUSH call
    # pushes the whole batch atomically.
    r.lpush(
        "cssci_article_by_id:start_urls",
        *(json.dumps({'third_id': tid}, ensure_ascii=False) for tid in third_ids),
    )
|
|
||||||
|
|
||||||
|
def starter():
    """Run the CSSCI article-by-id spider inside a blocking CrawlerProcess."""
    crawler_settings = get_project_settings()
    runner = CrawlerProcess(crawler_settings)
    runner.crawl(CssciArticleByIdSpider)
    # Blocks until the spider finishes and the Twisted reactor stops.
    runner.start()
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Seed the Redis queue first, then start the spider that consumes it.
    push_task()
    starter()