tianshangtianxia.py 2.2 KB
# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy.http import Request
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy_splash import SplashRequest
from selenium import webdriver

from manhua.items import mmonlyItem
  9. class TianshangtianxiaSpider(scrapy.Spider):
  10. name = 'tianshangtianxia'
  11. allowed_domains = ['manhua.fzdm.com']
  12. base_url = ['https://manhua.fzdm.com/2/938/']
  13. start_urls = ['https://manhua.fzdm.com/2/938/']
  14. visit_page = set()
  15. visit_img = set()
  16. def parse(self, response):
  17. hxs = Selector(response=response).xpath('//a[re:test(@href, "index_\d+")]/@href').extract()
  18. # print(hxs)
  19. for url in hxs:
  20. # md5_url = self.md5(url)
  21. if url in self.visit_page:
  22. pass
  23. else:
  24. self.visit_page.add(url)
  25. # print(url)
  26. url = 'https://manhua.fzdm.com/2/938/' + url
  27. yield SplashRequest(url=url, callback=self.parse )
  28. for url_detail in self.visit_page:
  29. if url_detail in self.visit_img:
  30. pass
  31. else:
  32. self.visit_img.add(url_detail)
  33. url_detail = 'https://manhua.fzdm.com/2/938/' + url_detail
  34. # print(url_detail)
  35. yield SplashRequest(url=url_detail, callback=self.parse_detail,dont_filter=True)
  36. # def md5(self, url):
  37. # import hashlib
  38. # obj = hashlib.md5()
  39. # obj.update(bytes(url, encoding='utf-8'))
  40. # return obj.hexdigest()
  41. #
  42. def parse_detail(self, response):
  43. # print(response.text)
  44. hxs = Selector(response=response).xpath('//img[@id="mhpic"]/@src').extract()
  45. for url in hxs:
  46. print(url)
  47. item = mmonlyItem()
  48. item['siteURL'] = url
  49. item['title'] = response.xpath('//img[@id="mhpic"]/@alt').extract_first()
  50. item['path'] = 'F:/tmp/' + item['title']
  51. path = item['path']
  52. name = response.xpath('//h1/text()').extract_first()
  53. if not os.path.exists(path):
  54. os.makedirs(path)
  55. item['detailURL'] = url
  56. item['fileName'] = item['path'] + '/' + name + '.jpg' # 拼接图片名称
  57. yield item