# verydm.py
# -*- coding: utf-8 -*-
import os
import time

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy_splash import SplashRequest
from selenium import webdriver

from manhua.items import mmonlyItem
  10. class VerydmSpider(scrapy.Spider):
  11. name = 'verydm'
  12. # allowed_domains = ['verdm.com']
  13. base_urls = ['http://www.verydm.com/chapter.php?id=89596']
  14. start_urls = ['http://www.verydm.com/chapter.php?id=89596']
  15. page =0
  16. def parse(self, response):
  17. hxs = Selector(response=response).xpath('//img[@id="mainImage2"]/@src').extract_first()
  18. hxs_next = Selector(response=response).xpath('//select/option/@value').extract()
  19. for i in hxs_next:
  20. s = i.zfill(3)
  21. url = 'http://imgn1.magentozh.com:8090//h/hb08tnd/ch_1/0'+s+'.jpg'
  22. print(url)
  23. item = mmonlyItem()
  24. item['siteURL'] = url
  25. # print('111111',item['siteURL'])
  26. item['title'] = response.xpath('//a[re:test(@href, "http://www.verydm.com/manhua/huawuyu")]/text()').extract_first()
  27. # print('222222', item['title'])
  28. name = response.xpath('//div[@class="breadcrumbs"]/span/text()').extract_first()
  29. # print('333333', name)
  30. item['path'] = 'F:/tmp/' + item['title'] + '/' + name
  31. path = item['path']
  32. # print('333333', path)
  33. if not os.path.exists(path):
  34. os.makedirs(path)
  35. item['detailURL'] = url
  36. item['fileName'] = item['path'] + '/' + name + '/' + str(i) +'.jpg' # 拼接图片名称
  37. yield item
  38. time.sleep(10)