cnki:add spider
parent
2bd56aeb10
commit
68306a03ab
@ -0,0 +1,40 @@
|
||||
from typing import AsyncIterator, Any
|
||||
|
||||
import scrapy
|
||||
|
||||
from science_article_cnki.models import cnki_model as model
|
||||
from science_article_cnki.configs import cnki as config
|
||||
|
||||
|
||||
class CnkiLatestIncrementSpider(scrapy.Spider):
    """Spider that sends a single CNKI advanced-search request.

    ``start`` assembles the search form from ``query``, ``resource_type`` and
    ``filters`` and posts it to ``config.CNKI_ADV_SEARCH_API``.

    NOTE(review): ``query`` and ``query_id`` are declared without values, so
    they are presumably supplied from outside (e.g. as spider arguments) --
    confirm against whatever launches this spider. ``query_id`` is not read
    anywhere in this class.
    """

    name = "cnki_latest_increment"

    # Per-spider overrides of the project settings.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
        },
        "ITEM_PIPELINES": {
            "science_article_cnki.pipelines.MongoPipeline": 300,
            "science_article_cnki.pipelines.DupTodoPipeline": 310,
            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
        },
        # "LOG_LEVEL": "INFO",
    }

    source = "cnki"
    # The literal means "academic journals"; it is forwarded unchanged to the
    # query builders in ``start``.
    resource_type: str = "学术期刊"

    query_id: int
    query: str
    # Class-level list: every instance that does not assign its own
    # ``filters`` shares this one object.
    filters: list = []

    async def start(self) -> AsyncIterator[Any]:
        """Yield the page-1 advanced-search ``FormRequest`` for ``self.query``.

        The dict of search arguments is also attached to the request under
        ``meta["REQUEST_Q"]``.
        """
        search_args = {
            "query": self.query,
            "resource_type": self.resource_type,
            "page": 1,
            "filters": self.filters,
        }
        body = model.adv_refine_search(**search_args)
        # Add the filter items to the query body. The return value is not
        # used, so the helper presumably updates ``body`` in place -- confirm.
        model.add_muti_filters(base_query=body, filters=search_args["filters"])
        form_data = model.adv_query_search(body, **search_args)
        yield scrapy.FormRequest(
            url=config.CNKI_ADV_SEARCH_API,
            method="POST",
            formdata=form_data,
            meta={"REQUEST_Q": search_args},
        )

    def parse(self, response):
        """Placeholder callback; the response is currently ignored."""
|
||||
Loading…
Reference in New Issue