kukukkk.py

# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy_splash import SplashRequest

from manhua.items import mmonlyItem
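
# Flow of this spider: parse() reads the chapter list of comic 158 and queues one
# SplashRequest per chapter; parse_page() reads a chapter's page count and queues
# one request per page, skipping URLs already logged in record.txt; parse_detail()
# extracts the page image into an mmonlyItem for the image pipeline. Splash is
# used here presumably because the site builds the image URLs with JavaScript.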
class KukukkkSpider(scrapy.Spider):
    name = 'kukukkk'
    # allowed_domains = ['kukukkk.com']
    start_urls = ['https://m.kukukkk.com/comiclist/158/']
    retry = 0  # consecutive retries for a page whose image could not be extracted

    def parse(self, response):
        # Manual switch: 0 = crawl every chapter, 1 = only the one hard-coded chapter.
        alldownload = 0
        hxs = response.xpath('//div[@id="list"]/li/a/@href').extract()
        hxs_title = response.xpath('//div[@id="list"]/li/a/text()').extract()
        if alldownload == 0:
            for url, title in zip(hxs, hxs_title):
                url_next = 'https://m.kukukkk.com/' + url
                yield SplashRequest(url=url_next, callback=self.parse_page,
                                    meta={'url': url, 'title': title})
        elif alldownload == 1:
            for url, title in zip(hxs, hxs_title):
                if url == '/comiclist/158/19905/1.htm':
                    url_next = 'https://m.kukukkk.com/' + url
                    yield SplashRequest(url=url_next, callback=self.parse_page,
                                        meta={'url': url, 'title': title})
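
    # parse_page: the chapter's first page carries a page counter in ul.subNav;
    # the second <li> is assumed to contain a 'current/total' pair whose text
    # after the '/' parses as the total page count (e.g. '1/23').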
    def parse_page(self, response):
        hxs = response.xpath('//ul[@class="subNav"]/li/text()').extract()
        page_total = int(hxs[1].split('/')[1])
        # meta['url'] looks like '/comiclist/158/19905/1.htm'; the second-to-last
        # path segment is the chapter id.
        url_s = response.meta['url'].split('/')[-2]
        record_path = 'F:/tmp/record.txt'
        if not os.path.exists(record_path):
            open(record_path, 'w').close()  # create the download log on the first run
        with open(record_path) as f:
            downloaded = {line.strip() for line in f}
        for i in range(1, page_total + 1):
            url_next = 'https://m.kukukkk.com/comiclist/158/' + url_s + '/' + str(i) + '.htm'
            print(url_next, i, page_total)
            if url_next in downloaded:
                print('already downloaded', url_next)
                continue
            yield SplashRequest(url=url_next, callback=self.parse_detail,
                                meta={'url': url_s, 'title': response.meta['title'], 'page': i})
            with open(record_path, 'a') as f:
                f.write(url_next + '\n')
            print('logged', url_next)
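
    # parse_detail: on a normal page the "next page" link (href matching
    # /comiclist/158/<chapter>/<digit>) wraps the page image; on the last page
    # the link points to /exit/exit.htm instead, so both patterns are checked.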
    def parse_detail(self, response):
        url_s = response.meta['url']
        print('chapter', url_s)
        title_s = response.meta['title']
        print('title', title_s)
        page = response.meta['page']
        print('page', page)
        filter_flag = '/comiclist/158/' + url_s + r'/\d'
        print('filter pattern', filter_flag)
        hxs_img = response.xpath('//a[re:test(@href, "' + filter_flag + '")]/img/@src').extract()
        hxs_exit = response.xpath('//a[re:test(@href, "/exit/exit.htm")]/img/@src').extract()
        hxs = response.xpath('//a[re:test(@href, "' + filter_flag + '")]/@href').extract()
        print('image URLs', hxs_img)
        print('last-page image URLs', hxs_exit)
        print('link URLs', hxs)
        img_urls = hxs_img or hxs_exit  # the last page carries the exit link instead
        if img_urls:
            for url in img_urls:
                item = mmonlyItem()
                item['siteURL'] = url
                item['title'] = title_s
                item['path'] = 'F:/tmp/' + item['title']
                path = item['path']
                if not os.path.exists(path):
                    os.makedirs(path)
                item['detailURL'] = url
                item['fileName'] = item['path'] + '/' + str(page) + '.jpg'  # build the image file name
                yield item
            self.retry = 0
        else:
            page_url = 'https://m.kukukkk.com/comiclist/158/' + url_s + '/' + str(page) + '.htm'
            if self.retry <= 3:
                self.retry = self.retry + 1
                print('retrying, attempt ' + str(self.retry), page_url)
                # dont_filter=True so the dupe filter does not drop the repeated URL
                yield SplashRequest(url=page_url, callback=self.parse_detail, dont_filter=True,
                                    meta={'url': url_s, 'title': response.meta['title'], 'page': page})
            else:
                with open('F:/tmp/fail.txt', 'a') as f:
                    f.write(page_url + '\n')
                print('logged failure', page_url)
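
# Usage sketch (assumes a running Splash instance and the scrapy-splash
# middleware, SPLASH_URL, etc. configured in the project's settings.py):
#   scrapy crawl kukukkk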