From b14a5037583662a716f58b0dc523177f1216b9c3 Mon Sep 17 00:00:00 2001
From: zhaoxiangpeng <1943364377@qq.com>
Date: Thu, 25 Sep 2025 10:57:54 +0800
Subject: [PATCH] add: highly cited scholars collection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../highly_cited_scholars/__init__.py          |   0
 .../highly_cited_scholars/items.py             |  22 ++
 .../highly_cited_scholars/middlewares.py       | 100 ++++++++++++++++++
 .../highly_cited_scholars/models/__init__.py   |   4 +
 .../models/cnki_model.py                       |  25 +++++
 .../highly_cited_scholars/pipelines.py         |  40 +++++++
 .../highly_cited_scholars/settings.py          |  95 +++++++++++++++++
 .../highly_cited_scholars/spiders/__init__.py  |   4 +
 .../spiders/cnki_hcs_list2024.py               |  71 ++++++++++++
 highly_cited_scholars/run.py                   |   9 ++
 highly_cited_scholars/scrapy.cfg               |  11 ++
 .../start_scripts/cnki_hcs.py                  |  20 ++++
 12 files changed, 401 insertions(+)
 create mode 100644 highly_cited_scholars/highly_cited_scholars/__init__.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/items.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/middlewares.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/models/__init__.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/models/cnki_model.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/pipelines.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/settings.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/spiders/__init__.py
 create mode 100644 highly_cited_scholars/highly_cited_scholars/spiders/cnki_hcs_list2024.py
 create mode 100644 highly_cited_scholars/run.py
 create mode 100644 highly_cited_scholars/scrapy.cfg
 create mode 100644 highly_cited_scholars/start_scripts/cnki_hcs.py

diff --git a/highly_cited_scholars/highly_cited_scholars/__init__.py b/highly_cited_scholars/highly_cited_scholars/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/highly_cited_scholars/highly_cited_scholars/items.py b/highly_cited_scholars/highly_cited_scholars/items.py
new file mode 100644
index 0000000..86d9e3f
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/items.py
@@ -0,0 +1,22 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class HighlyCitedScholarsItem(scrapy.Item):
+    # define the fields for your item here:
+    person_id = scrapy.Field()
+    sno = scrapy.Field()
+    scholar_name = scrapy.Field()
+    scholar_link = scrapy.Field()
+    organization = scrapy.Field()
+    subject = scrapy.Field()
+
+
+class CNKIHighlyCitedScholarItem(HighlyCitedScholarsItem):
+    certified_tag = scrapy.Field()
+    hc_type = scrapy.Field()
+    source = scrapy.Field()
diff --git a/highly_cited_scholars/highly_cited_scholars/middlewares.py b/highly_cited_scholars/highly_cited_scholars/middlewares.py
new file mode 100644
index 0000000..4e9f780
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/middlewares.py
@@ -0,0 +1,100 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class HighlyCitedScholarsSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    async def process_start(self, start):
+        # Called with an async iterator over the spider start() method or the
+        # matching method of an earlier spider middleware.
+        async for item_or_request in start:
+            yield item_or_request
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class HighlyCitedScholarsDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/highly_cited_scholars/highly_cited_scholars/models/__init__.py b/highly_cited_scholars/highly_cited_scholars/models/__init__.py
new file mode 100644
index 0000000..4bf8839
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/models/__init__.py
@@ -0,0 +1,4 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/9/24 15:50
+# @Author : zhaoxiangpeng
+# @File : __init__.py
diff --git a/highly_cited_scholars/highly_cited_scholars/models/cnki_model.py b/highly_cited_scholars/highly_cited_scholars/models/cnki_model.py
new file mode 100644
index 0000000..4f9cbe1
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/models/cnki_model.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# @Time : 2025/9/24 15:50
+# @Author : zhaoxiangpeng
+# @File : cnki_model.py
+
+def hcs_data_list(page: int | str = 1):
+    o = dict(
+        Author="",
+        Organ="",
+        SubjectCode="",
+        ParentCode="",
+        Type="",
+        code="1",
+        IsRz="-1",
+    )
+    if str(page) != "1":  # compare as strings so page=1 and page="1" behave alike
+        o["PageIndex"] = str(page)
+
+    return o
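+
+# A minimal usage sketch: page 1 keeps the defaults, later pages add PageIndex.
+#   hcs_data_list()       -> {"Author": "", "Organ": "", "SubjectCode": "",
+#                             "ParentCode": "", "Type": "", "code": "1", "IsRz": "-1"}
+#   hcs_data_list(page=3) -> the same dict plus {"PageIndex": "3"}
+# All values stay strings because the payload is sent as form data.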
diff --git a/highly_cited_scholars/highly_cited_scholars/pipelines.py b/highly_cited_scholars/highly_cited_scholars/pipelines.py
new file mode 100644
index 0000000..5f4d66c
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/pipelines.py
@@ -0,0 +1,40 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+import pymongo
+
+
+class HighlyCitedScholarsPipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class HCS2DBPipeline:
+    collection_name = "data_hcs"
+
+    def __init__(self, mongo_uri, mongo_db):
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(
+            mongo_uri=crawler.settings.get("MONGO_URI"),
+            mongo_db=crawler.settings.get("MONGO_DATABASE", "items"),
+        )
+
+    def open_spider(self, spider):
+        self.client = pymongo.MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def close_spider(self, spider):
+        self.client.close()
+
+    def process_item(self, item, spider):
+        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
+        return item
diff --git a/highly_cited_scholars/highly_cited_scholars/settings.py b/highly_cited_scholars/highly_cited_scholars/settings.py
new file mode 100644
index 0000000..c4fb063
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/settings.py
@@ -0,0 +1,95 @@
+# Scrapy settings for highly_cited_scholars project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "highly_cited_scholars"
+
+SPIDER_MODULES = ["highly_cited_scholars.spiders"]
+NEWSPIDER_MODULE = "highly_cited_scholars.spiders"
+
+ADDONS = {}
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = "highly_cited_scholars (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Concurrency and throttling settings
+#CONCURRENT_REQUESTS = 16
+CONCURRENT_REQUESTS_PER_DOMAIN = 1
+DOWNLOAD_DELAY = 5
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "highly_cited_scholars.middlewares.HighlyCitedScholarsSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+RETRY_ENABLED = True
+RETRY_TIMES = 2  # up to 2 retries, i.e. 3 attempts in total
+RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 400, 403, 404]  # adds some common client error codes to the defaults
+#DOWNLOADER_MIDDLEWARES = {
+#    "highly_cited_scholars.middlewares.HighlyCitedScholarsDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    "highly_cited_scholars.pipelines.HighlyCitedScholarsPipeline": 300,
+#}
+
+MONGO_URI = "mongodb://root:123456@192.168.1.211:27017/"
+MONGO_DATABASE = "science2"
+
+REDIS_URL = 'redis://:kcidea1509@192.168.1.211:6379/10'
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+FEED_EXPORT_ENCODING = "utf-8"
diff --git a/highly_cited_scholars/highly_cited_scholars/spiders/__init__.py b/highly_cited_scholars/highly_cited_scholars/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/highly_cited_scholars/highly_cited_scholars/spiders/cnki_hcs_list2024.py b/highly_cited_scholars/highly_cited_scholars/spiders/cnki_hcs_list2024.py
new file mode 100644
index 0000000..9bdaf09
--- /dev/null
+++ b/highly_cited_scholars/highly_cited_scholars/spiders/cnki_hcs_list2024.py
@@ -0,0 +1,71 @@
+# Note: the list is ordered by CNKI highly-cited scholar type, Ministry of
+# Education first-level discipline code, and the pinyin of the author's name.
+import json
+import logging
+
+import scrapy
+from scrapy_redis.spiders import RedisSpider
+from scrapy_redis.utils import bytes_to_str
+from highly_cited_scholars.models import cnki_model as model
+from highly_cited_scholars.items import CNKIHighlyCitedScholarItem
+
+logging.getLogger("pymongo").setLevel(logging.WARNING)
+
+
+class CnkiHcsList2024Spider(RedisSpider):
+    name = "cnki_hcs_list2024"
+    allowed_domains = ["xs.cnki.net"]
+    start_urls = ["https://xs.cnki.net/List/HCS_List/"]
+    list_api = "https://xs.cnki.net/List/AjaxHcsDataList"
+    custom_settings = dict(
+        ITEM_PIPELINES={
+            "highly_cited_scholars.pipelines.HCS2DBPipeline": 300,
+        },
+        DUPEFILTER_CLASS="scrapy_redis.dupefilter.RFPDupeFilter",
+        SCHEDULER="scrapy_redis.scheduler.Scheduler",
+    )
+
+    def make_request_from_data(self, data):
+        data = bytes_to_str(data)
+        data = json.loads(data)
+        yield scrapy.FormRequest(url=self.list_api, method="POST",
+                                 formdata=model.hcs_data_list(page=data["page"]),
+                                 meta=data)
+
+    def parse(self, response):
+        tr_nodes = response.xpath(r'//*[@id="datatable"]/tbody/tr')
+        for tr_node in tr_nodes:
+            sno = tr_node.xpath(r'./td[1]/text()').get()  # rank
+            hcs_name = tr_node.xpath(r'./td[2]/a/text()').get()  # scholar name
+            hcs_link = tr_node.xpath(r'./td[2]/a[@class="hcs_namelink"]/@href').get()
+
+            certified_tag = tr_node.xpath(r'./td[2]/img').get()  # certification badge
+
+            organization = tr_node.xpath(r'./td[3]/text()').get()  # author affiliation
+            subject = tr_node.xpath(r'./td[4]/text()').get()  # discipline
+            hc_type = tr_node.xpath(r'./td[5]/div/span/text()').get()  # scholar type
+            hcs_item = CNKIHighlyCitedScholarItem()
+            if hcs_link:
+                hcs_item['person_id'] = hcs_link.rsplit('/', maxsplit=1)[-1]  # personId
+                hcs_item['scholar_link'] = response.urljoin(hcs_link)
+            hcs_item['sno'] = int(sno)
+            hcs_item['scholar_name'] = hcs_name
+            hcs_item['organization'] = organization
+            hcs_item['subject'] = subject
+            if certified_tag:
+                hcs_item['certified_tag'] = 1
+            hcs_item['hc_type'] = hc_type
+            hcs_item['source'] = "xs.cnki.net"
+            yield hcs_item
+
+        if response.meta.get("page") == 1:
+            # read the largest page number from the "尾页" (last page) link
+            last_page = response.xpath(r'//*[@id="pagertable"]/a[text()="尾页"]/@data-page').get()
+            for i in range(2, int(last_page) + 1):
+                yield scrapy.FormRequest(url=self.list_api, method="POST",
+                                         formdata=model.hcs_data_list(page=i),
+                                         priority=i)
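+
+# A sketch of the seed message this spider expects on its Redis start queue
+# (see start_scripts/cnki_hcs.py, which pushes it):
+#   LPUSH cnki_hcs_list2024:start_urls '{"page": 1}'
+# make_request_from_data() decodes that JSON, POSTs the page-1 form, and
+# carries the dict through meta so parse() knows it is on page 1 and should
+# schedule pages 2..last after reading the "尾页" (last page) link.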
diff --git a/highly_cited_scholars/run.py b/highly_cited_scholars/run.py
new file mode 100644
index 0000000..3fe1032
--- /dev/null
+++ b/highly_cited_scholars/run.py
@@ -0,0 +1,9 @@
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+
+process = CrawlerProcess(get_project_settings())
+
+process.crawl('cnki_hcs_list2024')
+process.start()
+
diff --git a/highly_cited_scholars/scrapy.cfg b/highly_cited_scholars/scrapy.cfg
new file mode 100644
index 0000000..15857b5
--- /dev/null
+++ b/highly_cited_scholars/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = highly_cited_scholars.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = highly_cited_scholars
diff --git a/highly_cited_scholars/start_scripts/cnki_hcs.py b/highly_cited_scholars/start_scripts/cnki_hcs.py
new file mode 100644
index 0000000..0bdf4df
--- /dev/null
+++ b/highly_cited_scholars/start_scripts/cnki_hcs.py
@@ -0,0 +1,20 @@
+import json
+import redis
+
+import highly_cited_scholars.settings as settings
+
+
+def seed_start_page():
+    # Push one JSON seed message onto the spider's Redis start queue.
+    r = redis.StrictRedis.from_url(settings.REDIS_URL)
+    r.lpush("cnki_hcs_list2024:start_urls", json.dumps({"page": 1}, ensure_ascii=False))
+
+
+if __name__ == '__main__':
+    seed_start_page()
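+
+# A usage sketch (assumes the Redis/Mongo endpoints in settings.py are reachable):
+#   1) from the project root, seed the start queue:
+#        python -m start_scripts.cnki_hcs
+#   2) start the crawl; it pops the seed, fetches page 1, then pages 2..last:
+#        python run.py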