```python
# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy.selector import Selector
from scrapy_splash import SplashRequest

from manhua.items import mmonlyItem


class TianshangtianxiaSpider(scrapy.Spider):
    name = 'tianshangtianxia'
    allowed_domains = ['manhua.fzdm.com']
    base_url = 'https://manhua.fzdm.com/2/938/'  # chapter base, prepended to relative links
    start_urls = ['https://manhua.fzdm.com/2/938/']
    visit_page = set()  # relative pagination URLs already followed
    visit_img = set()   # relative URLs whose detail page was already requested
    def parse(self, response):
        # Collect pagination links of the form index_<n>.html from the
        # chapter's listing page.
        hxs = Selector(response=response).xpath(
            r'//a[re:test(@href, "index_\d+")]/@href').extract()
        for url in hxs:
            # The relative URLs are short, so they are deduplicated directly
            # in a set; hashing them (e.g. with md5) first is unnecessary.
            if url not in self.visit_page:
                self.visit_page.add(url)
                # Follow each newly discovered listing page through Splash so
                # its JavaScript-rendered links become visible too.
                yield SplashRequest(url=self.base_url + url, callback=self.parse)
        # Iterate over a snapshot: concurrent parse() callbacks may add to the
        # set while this generator is still yielding.
        for url_detail in list(self.visit_page):
            if url_detail not in self.visit_img:
                self.visit_img.add(url_detail)
                yield SplashRequest(url=self.base_url + url_detail,
                                    callback=self.parse_detail,
                                    dont_filter=True)

    def parse_detail(self, response):
        # The comic image is injected into <img id="mhpic"> by JavaScript,
        # which is why the detail pages are also fetched through Splash.
        hxs = Selector(response=response).xpath('//img[@id="mhpic"]/@src').extract()
        for url in hxs:
            item = mmonlyItem()
            item['siteURL'] = url
            item['title'] = response.xpath('//img[@id="mhpic"]/@alt').extract_first()
            item['path'] = 'F:/tmp/' + item['title']
            path = item['path']
            name = response.xpath('//h1/text()').extract_first()
            if not os.path.exists(path):
                os.makedirs(path)
            item['detailURL'] = url
            item['fileName'] = item['path'] + '/' + name + '.jpg'  # build the image file name
            yield item
```
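
The spider imports `mmonlyItem` from `manhua.items`, which is not shown above. Judging from the fields the spider assigns (`siteURL`, `title`, `path`, `detailURL`, `fileName`), a minimal sketch of `manhua/items.py` would be:

```python
import scrapy


class mmonlyItem(scrapy.Item):
    # Fields inferred from the assignments in parse_detail().
    siteURL = scrapy.Field()    # direct URL of the comic image
    title = scrapy.Field()      # alt text of the image, used as the folder name
    path = scrapy.Field()       # local folder, e.g. F:/tmp/<title>
    detailURL = scrapy.Field()  # same value as siteURL in this spider
    fileName = scrapy.Field()   # full local path for the saved image
```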
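
`SplashRequest` only works once scrapy-splash is wired into the project settings. Assuming a Splash instance is running locally on the default port 8050, the standard configuration from the scrapy-splash README is:

```python
# settings.py (excerpt) - standard scrapy-splash wiring
SPLASH_URL = 'http://localhost:8050'  # assumption: local Splash instance

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
```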
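
Note that nothing in the spider itself downloads the images: it only creates the target folder and yields items carrying `siteURL` and `fileName`, so the saving presumably happens in a pipeline in `manhua/pipelines.py`. A minimal sketch of what such a pipeline could look like (the class name is hypothetical and the real project may use a different approach, e.g. Scrapy's `ImagesPipeline`):

```python
# pipelines.py - hypothetical download pipeline, not the project's actual code
import requests


class MmonlyDownloadPipeline:
    def process_item(self, item, spider):
        # The spider already created the folder in item['path'];
        # fetch the image and write it to the precomputed file name.
        resp = requests.get(item['siteURL'], timeout=30)
        resp.raise_for_status()
        with open(item['fileName'], 'wb') as f:
            f.write(resp.content)
        return item
```

It would then be enabled in `settings.py` with `ITEM_PIPELINES = {'manhua.pipelines.MmonlyDownloadPipeline': 300}`.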