wos: launcher code
parent 131b760adf
commit 1b0da2c41e
crawl_article_by_qid.py
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/11 13:56
# @Author : zhaoxiangpeng
# @File : crawl_article_by_qid.py
import math
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.spiders.download_by_qid import DownloadByQidSpider

BATCH_DOWNLOAD_LIMIT = 500

process = CrawlerProcess(get_project_settings())
RECORDS_FOUND = 1486
wos_download_todo = [

]


def f(record_id: str, records_found: int):  # yield mark_from/mark_to windows of BATCH_DOWNLOAD_LIMIT records
    mark_start = 1
    mark_end = 0
    idx = 0
    for i in range(math.ceil(records_found / BATCH_DOWNLOAD_LIMIT)):
        idx += 1
        mark_end += BATCH_DOWNLOAD_LIMIT

        if mark_end > records_found:
            mark_end = records_found

        yield dict(
            record_id=record_id, batch=idx,
            mark_from=mark_start, mark_to=mark_end, records_found=records_found
        )

        mark_start += BATCH_DOWNLOAD_LIMIT


init_params = dict(
    record_id='02f30273-1342-4d61-9e51-c1ea1f5b2423-0190efdd10',
    mark_from=1, mark_to=500, records_found=10641
)
process.crawl(DownloadByQidSpider, **init_params)
process.start()
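
Note: the batch helper f() and the wos_download_todo list are defined above but never used; the launcher only queues the single crawl described by init_params. A minimal sketch of how they could be wired together, assuming wos_download_todo were filled with (record_id, records_found) pairs (that pair layout is an assumption, not part of the commit):

# Sketch only: queue one crawl per mark_from/mark_to window.
# With records_found=1486 and BATCH_DOWNLOAD_LIMIT=500, f() yields three
# windows: 1-500, 501-1000 and 1001-1486.
for record_id, records_found in wos_download_todo:
    for params in f(record_id, records_found):
        # params carries record_id, batch, mark_from, mark_to, records_found
        process.crawl(DownloadByQidSpider, **params)
process.start()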
crawl_article_by_ut.py
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# @Time : 2025/12/11 17:07
# @Author : zhaoxiangpeng
# @File : crawl_article_by_ut.py
import math
import time
import logging
from twisted.internet import defer
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from science_article_add.spiders.wos_download import WosDownloadSpider

logging.getLogger('pymongo').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
BATCH_DOWNLOAD_LIMIT = 500


@defer.inlineCallbacks
def crawl_sequentially():
    settings = get_project_settings()
    from pymongo import MongoClient
    client = MongoClient(settings.get("MONGO_URI"))
    db = client.get_database(settings.get("MONGO_DATABASE"))
    collection = db.get_collection("todo_ids_wos")

    def f():
        count = collection.count_documents(filter={"state": 0})  # documents still pending download (state == 0)
        return count

    while count_doc := f():
        logger.info('pending downloads: %d' % count_doc)
        yield process.crawl(WosDownloadSpider)
        time.sleep(60)  # plain sleep; blocks the Twisted reactor between crawls

    process.stop()  # shut down the event loop once all spiders have finished


if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    crawl_sequentially()
    process.start()  # blocks until all crawls have finished
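
For comparison, the pattern Scrapy's documentation uses for chaining crawls sequentially relies on CrawlerRunner and an explicitly managed Twisted reactor rather than CrawlerProcess. A sketch of that variant, reusing the same MONGO_URI / MONGO_DATABASE settings and todo_ids_wos collection as above (a sketch, not a drop-in replacement for the script):

# Sketch: sequential crawls via CrawlerRunner + reactor (Scrapy docs pattern).
from pymongo import MongoClient
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from science_article_add.spiders.wos_download import WosDownloadSpider

settings = get_project_settings()
configure_logging()  # CrawlerRunner does not install logging on its own
runner = CrawlerRunner(settings)
collection = (MongoClient(settings.get("MONGO_URI"))
              .get_database(settings.get("MONGO_DATABASE"))
              .get_collection("todo_ids_wos"))

@defer.inlineCallbacks
def crawl():
    # repeat the crawl while documents are still pending (state == 0)
    while collection.count_documents({"state": 0}):
        yield runner.crawl(WosDownloadSpider)
    reactor.stop()

crawl()
reactor.run()  # blocks until reactor.stop() is called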