import os

import scrapy
from scrapy.selector import Selector
from scrapy_splash import SplashRequest

from manhua.items import mmonlyItem


class TianshangtianxiaSpider(scrapy.Spider):
    name = 'tianshangtianxia'
    allowed_domains = ['manhua.fzdm.com']
    base_url = 'https://manhua.fzdm.com/2/938/'
    start_urls = ['https://manhua.fzdm.com/2/938/']
    visit_page = set()  # pagination hrefs already scheduled
    visit_img = set()   # detail pages already scheduled for image parsing

    def parse(self, response):
        # Collect the "index_N" pagination links on the current page.
        hxs = Selector(response=response).xpath(
            r'//a[re:test(@href, "index_\d+")]/@href').extract()

        for url in hxs:
            if url not in self.visit_page:
                self.visit_page.add(url)
                # Render each newly found listing page through Splash
                # and feed it back into parse to keep crawling pages.
                yield SplashRequest(url=self.base_url + url, callback=self.parse)

        # Every known listing page is also a detail page holding one image.
        for url_detail in self.visit_page:
            if url_detail not in self.visit_img:
                self.visit_img.add(url_detail)
                yield SplashRequest(url=self.base_url + url_detail,
                                    callback=self.parse_detail,
                                    dont_filter=True)

    def parse_detail(self, response):
        # The comic image is loaded into <img id="mhpic"> after rendering.
        hxs = Selector(response=response).xpath('//img[@id="mhpic"]/@src').extract()
        for url in hxs:
            print(url)
            item = mmonlyItem()
            item['siteURL'] = url
            item['detailURL'] = url
            item['title'] = response.xpath('//img[@id="mhpic"]/@alt').extract_first()
            item['path'] = 'F:/tmp/' + item['title']
            if not os.path.exists(item['path']):
                os.makedirs(item['path'])
            name = response.xpath('//h1/text()').extract_first()
            item['fileName'] = item['path'] + '/' + name + '.jpg'
            yield item
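
The spider imports mmonlyItem from manhua.items, but the item class itself is not shown here. Below is a minimal sketch of what that items.py could look like, inferred purely from the five fields the spider assigns (siteURL, detailURL, title, path, fileName); the module path and class name come from the import, everything else is an assumption.

# manhua/items.py -- minimal item definition inferred from the spider above;
# adjust if the real project declares additional fields.
import scrapy


class mmonlyItem(scrapy.Item):
    siteURL = scrapy.Field()    # URL of the comic image
    detailURL = scrapy.Field()  # same image URL, kept for the download step
    title = scrapy.Field()      # taken from the <img id="mhpic"> alt text
    path = scrapy.Field()       # local directory the image is saved under
    fileName = scrapy.Field()   # full path of the saved .jpg file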
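SplashRequest only works if scrapy-splash is wired into the project settings and a Splash server is running. The following is a sketch of the settings documented in the scrapy-splash README, assuming Splash listens on localhost:8050 (e.g. started with docker run -p 8050:8050 scrapinghub/splash); it is not taken from the original project.

# settings.py -- scrapy-splash wiring (assumes a local Splash instance)
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'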