# -*- coding: utf-8 -*-
import os

import scrapy
from scrapy_splash import SplashRequest

from manhua.items import mmonlyItem
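
# For reference, a minimal sketch of the mmonlyItem imported above, assuming it
# lives in manhua/items.py and declares exactly the fields this spider fills
# in; the real project may define more:
#
#     import scrapy
#
#     class mmonlyItem(scrapy.Item):
#         siteURL = scrapy.Field()    # image URL as scraped from the page
#         detailURL = scrapy.Field()  # URL handed to the download pipeline
#         title = scrapy.Field()      # chapter title, used as the folder name
#         path = scrapy.Field()       # local folder, e.g. F:/tmp/<title>
#         fileName = scrapy.Field()   # full path of the image on disk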


class KukukkkSpider(scrapy.Spider):
    name = 'kukukkk'
    # allowed_domains = ['kukukkk.com']
    start_urls = ['https://m.kukukkk.com/comiclist/158/']
    retry = 0  # retry counter for detail pages whose images failed to load

    def parse(self, response):
        """Collect every chapter link on the comic's index page."""
        # alldownload == 0 crawls every chapter; set it to 1 to re-crawl a
        # single chapter picked out by its URL.
        alldownload = 0
        urls = response.xpath('//div[@id="list"]/li/a/@href').extract()
        titles = response.xpath('//div[@id="list"]/li/a/text()').extract()
        if alldownload == 0:
            for url, chapter_title in zip(urls, titles):
                url_next = 'https://m.kukukkk.com/' + url
                yield SplashRequest(url=url_next, callback=self.parse_page,
                                    meta={'url': url, 'title': chapter_title})
        elif alldownload == 1:
            for url, chapter_title in zip(urls, titles):
                if url == '/comiclist/158/19905/1.htm':
                    url_next = 'https://m.kukukkk.com/' + url
                    yield SplashRequest(url=url_next, callback=self.parse_page,
                                        meta={'url': url, 'title': chapter_title})

    def parse_page(self, response):
        """Work out how many pages the chapter has and request each one."""
        # The page indicator in the sub navigation reads like "1/33".
        nav = response.xpath('//ul[@class="subNav"]/li/text()').extract()
        page_total = int(nav[1].split('/')[1])
        # Chapter id from a URL like /comiclist/158/19905/1.htm -> 19905.
        url_s = response.meta['url'].split('/')[-2]
        # Load the log of already-downloaded page URLs once per chapter.
        record_path = 'F:/tmp/record.txt'
        downloaded = set()
        if os.path.exists(record_path):
            with open(record_path) as f:
                downloaded = {line.strip() for line in f}
        for i in range(1, page_total + 1):
            url_next = 'https://m.kukukkk.com/comiclist/158/' + url_s + '/' + str(i) + '.htm'
            print(url_next, i, page_total)
            if url_next in downloaded:
                print('already downloaded:', url_next)
                continue
            yield SplashRequest(url=url_next, callback=self.parse_detail,
                                meta={'url': url_s,
                                      'title': response.meta['title'],
                                      'page': i})
            downloaded.add(url_next)
            with open(record_path, 'a') as f:
                f.write(url_next + '\n')
            print('logged:', url_next)

    def parse_detail(self, response):
        """Extract the image URL from one page and yield a download item."""
        url_s = response.meta['url']
        title_s = response.meta['title']
        page = response.meta['page']
        print('chapter:', url_s, 'title:', title_s, 'page:', page)
        # Only match links that lead to another page of the same chapter.
        filter_flag = '/comiclist/158/' + url_s + r'/\d'
        print('filter pattern:', filter_flag)
        hxs_img = response.xpath('//a[re:test(@href, "' + filter_flag + '")]/img/@src').extract()
        # The last page of a chapter links to /exit/exit.htm instead.
        hxs_exit = response.xpath('//a[re:test(@href, "/exit/exit.htm")]/img/@src').extract()
        print('image URLs:', hxs_img)
        print('last-page image URLs:', hxs_exit)
        if hxs_img or hxs_exit:
            for url in hxs_img or hxs_exit:
                item = mmonlyItem()
                item['siteURL'] = url
                item['title'] = title_s
                item['path'] = 'F:/tmp/' + item['title']
                if not os.path.exists(item['path']):
                    os.makedirs(item['path'])
                item['detailURL'] = url
                # Build the image file name from the page number.
                item['fileName'] = item['path'] + '/' + str(page) + '.jpg'
                yield item
            self.retry = 0
        else:
            page_url = 'https://m.kukukkk.com/comiclist/158/' + url_s + '/' + str(page) + '.htm'
            if self.retry <= 3:
                self.retry += 1
                print('retrying, attempt ' + str(self.retry) + ':', page_url)
                # dont_filter lets the duplicate filter accept the re-request.
                yield SplashRequest(url=page_url, callback=self.parse_detail,
                                    meta={'url': url_s, 'title': title_s, 'page': page},
                                    dont_filter=True)
            else:
                with open('F:/tmp/fail.txt', 'a') as f:
                    f.write(page_url + '\n')
                print('logged failure:', page_url)
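
# SplashRequest only works when scrapy-splash is wired into the project and a
# Splash instance is running (for example via its Docker image). A minimal
# settings.py sketch following the scrapy-splash README; the SPLASH_URL value
# is an assumption for a local instance:
#
#     SPLASH_URL = 'http://localhost:8050'
#     DOWNLOADER_MIDDLEWARES = {
#         'scrapy_splash.SplashCookiesMiddleware': 723,
#         'scrapy_splash.SplashMiddleware': 725,
#         'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
#     }
#     SPIDER_MIDDLEWARES = {
#         'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
#     }
#     DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'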
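
# The items yielded above carry siteURL and fileName, so a pipeline in
# manhua/pipelines.py presumably downloads each image to disk. A minimal,
# hypothetical sketch using requests (the real project may use Scrapy's
# ImagesPipeline or its own downloader instead):
#
#     import requests
#
#     class mmonlyPipeline(object):
#         def process_item(self, item, spider):
#             # Fetch the image and write it to the precomputed file name.
#             resp = requests.get(item['siteURL'], timeout=30)
#             with open(item['fileName'], 'wb') as fp:
#                 fp.write(resp.content)
#             return item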