123456789101112131415161718192021222324252627282930313233343536373839404142 |
- # -*- coding: utf-8 -*-
- import scrapy
- import os
- from manhua.items import mmonlyItem
- from scrapy.selector import Selector,HtmlXPathSelector
- from scrapy.http import Request
- from selenium import webdriver
- from scrapy_splash import SplashRequest
- import time
class VerydmSpider(scrapy.Spider):
    """Emit one ``mmonlyItem`` per page image of a verydm.com chapter.

    The chapter page listed in ``start_urls`` is fetched once.  Every
    ``<option>`` value found under a ``<select>`` element (presumably the
    page picker -- verify against the live markup) is turned into an image
    URL, and an item describing where that image should be stored is
    yielded for it.
    """

    name = 'verydm'
    # allowed_domains is left disabled, as in the original
    # (the commented-out value was ['verdm.com']).
    base_urls = ['http://www.verydm.com/chapter.php?id=89596']
    start_urls = ['http://www.verydm.com/chapter.php?id=89596']
    page = 0

    # Values that used to be hard-coded inside parse(); exposed here so a
    # subclass can override them.  Defaults reproduce the original output.
    image_url_prefix = 'http://imgn1.magentozh.com:8090//h/hb08tnd/ch_1/0'
    save_root = 'F:/tmp/'
    item_delay = 10  # seconds to pause after each yielded item

    def parse(self, response):
        """Yield an ``mmonlyItem`` for every page option on the chapter page.

        Args:
            response: the downloaded chapter page.

        Yields:
            mmonlyItem: with ``siteURL``/``detailURL`` set to the image URL,
            ``title`` to the comic title, ``path`` to the target directory
            and ``fileName`` to the target file path.
        """
        page_values = response.xpath('//select/option/@value').extract()
        if not page_values:
            self.logger.warning('no page options found on %s', response.url)
            return

        # Title and chapter name are the same for every page of the chapter,
        # so they are looked up once instead of once per image.
        title = response.xpath('//a[re:test(@href, "http://www.verydm.com/manhua/huawuyu")]/text()').extract_first()
        chapter = response.xpath('//div[@class="breadcrumbs"]/span/text()').extract_first()
        if title is None or chapter is None:
            # extract_first() returns None when nothing matches; report that
            # explicitly instead of failing on str + None below.
            self.logger.error(
                'could not find title (%r) or chapter name (%r) on %s',
                title, chapter, response.url,
            )
            return

        directory = self.save_root + title + '/' + chapter
        os.makedirs(directory, exist_ok=True)

        for value in page_values:
            # Page number is zero-padded to three digits after the prefix.
            url = self.image_url_prefix + value.zfill(3) + '.jpg'
            self.logger.debug('image url: %s', url)

            item = mmonlyItem()
            item['siteURL'] = url
            item['title'] = title
            item['path'] = directory
            item['detailURL'] = url
            # Build the image file name.
            # NOTE(review): the chapter name appears twice in this path
            # (item['path'] already ends with it) and only item['path'] is
            # created above -- confirm the consuming pipeline expects this.
            item['fileName'] = item['path'] + '/' + chapter + '/' + str(value) + '.jpg'
            yield item

            # NOTE(review): time.sleep blocks Scrapy's event loop; kept to
            # preserve the existing pause between items.  If the images are
            # fetched through Scrapy's downloader, DOWNLOAD_DELAY is the
            # non-blocking way to throttle.
            time.sleep(self.item_delay)
|