# -*- coding: utf-8 -*-
# @Time   : 2025/12/11 17:07
# @Author : zhaoxiangpeng
# @File   : crawl_article_by_ut.py
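"""Repeatedly run WosDownloadSpider while the MongoDB collection
``todo_ids_wos`` still holds documents with ``state == 0`` (i.e. not yet
downloaded), pausing 60 seconds between runs. Judging by the file and
spider names, this appears to drive batch downloads of Web of Science
article records by UT identifier.
"""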
import logging

from pymongo import MongoClient
from twisted.internet import defer, reactor, task
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from science_article_add.spiders.wos_download import WosDownloadSpider

# Silence pymongo's verbose DEBUG output.
logging.getLogger('pymongo').setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

BATCH_DOWNLOAD_LIMIT = 500
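# NOTE: BATCH_DOWNLOAD_LIMIT is not referenced in this script; presumably the
# spider (or its settings) reads it to cap how many records one run fetches.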
@defer.inlineCallbacks
def crawl_sequentially():
    settings = get_project_settings()
    client = MongoClient(settings.get("MONGO_URI"))
    db = client.get_database(settings.get("MONGO_DATABASE"))
    collection = db.get_collection("todo_ids_wos")

    def pending_count():
        # Documents with state == 0 have not been downloaded yet.
        return collection.count_documents(filter={"state": 0})

    # Keep launching the spider until nothing is left to download.
    while count_doc := pending_count():
        logger.info('Documents pending download: %d', count_doc)
        yield process.crawl(WosDownloadSpider)
        # Wait 60 seconds between runs without blocking the event loop; the
        # original time.sleep(60) would have stalled the Twisted reactor here.
        yield task.deferLater(reactor, 60, lambda: None)

    process.stop()  # shut down the event loop once all crawls have finished

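# This mirrors the "run spiders sequentially" pattern from the Scrapy docs:
# process.crawl() returns a Deferred, so an @inlineCallbacks generator can
# yield each run and only start the next once the previous one has finished.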
if __name__ == '__main__':
    # `process` is read as a global inside crawl_sequentially().
    process = CrawlerProcess(get_project_settings())
    crawl_sequentially()  # schedules the crawl loop; returns immediately
    process.start()       # blocks until process.stop() is called
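# Usage (a sketch, assuming this file lives inside the `science_article_add`
# Scrapy project so get_project_settings() can resolve the settings via
# scrapy.cfg or SCRAPY_SETTINGS_MODULE, and that MONGO_URI / MONGO_DATABASE
# are defined in those settings):
#
#   $ python crawl_article_by_ut.py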