cnki:add spider
parent
2bd56aeb10
commit
68306a03ab
@ -0,0 +1,40 @@
|
|||||||
|
from typing import AsyncIterator, Any
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
from science_article_cnki.models import cnki_model as model
|
||||||
|
from science_article_cnki.configs import cnki as config
|
||||||
|
|
||||||
|
|
||||||
|
class CnkiLatestIncrementSpider(scrapy.Spider):
    """Spider that sends the first-page CNKI advanced-search request for one query.

    ``query`` and ``query_id`` are only declared on the class, never assigned
    here; they are presumably supplied as spider arguments at launch time --
    confirm against whatever schedules this spider.
    """

    name = "cnki_latest_increment"

    # Per-spider overrides layered on top of the project-wide Scrapy settings.
    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "science_article_cnki.middlewares.CnkiSearchHeadersDownloaderMiddleware": 540,
        },
        "ITEM_PIPELINES": {
            "science_article_cnki.pipelines.MongoPipeline": 300,
            "science_article_cnki.pipelines.DupTodoPipeline": 310,
            # Currently disabled:
            # "science_article_cnki.pipelines.verify_data.VerifyDataIntegrity": 400,
        },
        # Currently disabled:
        # "LOG_LEVEL": "INFO",
    }

    source = 'cnki'
    resource_type: str = "学术期刊"  # Chinese for "academic journals"

    # Declared without values; reading them before they are set raises AttributeError.
    query_id: int
    query: str
    # Class-level default used when no filters are set on the instance.
    filters: list = []

    async def start(self) -> AsyncIterator[Any]:
        """Yield the single POST form request that opens the search at page 1.

        The search arguments (query, resource type, page and filters) are passed
        to the ``model`` helpers to build the form payload, and the same
        arguments travel with the request under the ``REQUEST_Q`` meta key.
        """
        search_args = {
            "query": self.query,
            "resource_type": self.resource_type,
            "page": 1,
            "filters": self.filters,
        }

        query_body = model.adv_refine_search(**search_args)
        # Add the filter items to the query body. The helper's return value is
        # not used, so it presumably updates ``query_body`` in place -- confirm.
        model.add_muti_filters(base_query=query_body, filters=search_args["filters"])
        form_data = model.adv_query_search(query_body, **search_args)

        yield scrapy.FormRequest(
            url=config.CNKI_ADV_SEARCH_API,
            method="POST",
            formdata=form_data,
            meta={"REQUEST_Q": search_args},
        )

    def parse(self, response):
        """Default response callback; intentionally does nothing for now."""
|
||||||
Loading…
Reference in New Issue