settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for manhua project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'manhua'
SPIDER_MODULES = ['manhua.spiders']
NEWSPIDER_MODULE = 'manhua.spiders'

# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
#
# DOWNLOADER_MIDDLEWARES = {
#     'manhua.middlewares.AreaSpiderMiddleware': 543,
# }
#
# ITEM_PIPELINES = {
#     'manhua.pipelines.AirHistoryPipeline': 300,
# }

# URL of the Splash rendering service
SPLASH_URL = 'http://192.168.99.100:8050'
# Splash-aware duplicate request filter
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# HTTP cache storage that understands Splash requests
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
# Spider middleware
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
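# With the Splash settings above, requests that need JavaScript rendering go
# through the Splash service at SPLASH_URL. A minimal sketch of how a spider
# would use them (this belongs in manhua/spiders/, not in this file; the
# spider name and start URL below are assumptions for illustration only):
#
#     import scrapy
#     from scrapy_splash import SplashRequest
#
#     class MmonlySpider(scrapy.Spider):
#         name = 'mmonly'
#
#         def start_requests(self):
#             # Rendered by Splash; 'wait' gives the page time to finish loading
#             yield SplashRequest('http://www.example.com/', self.parse,
#                                 args={'wait': 1})
#
#         def parse(self, response):
#             # response.text now contains the JavaScript-rendered HTML
#             yield {'title': response.css('title::text').get()}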
# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # Alternative user agent:
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

# Item pipelines
# ITEM_PIPELINES = {
#     'news.pipelines.NewsPipeline': 300,
# }

FEED_EXPORT_ENCODING = 'utf-8'
ROBOTSTXT_OBEY = False
# Maximum number of concurrent requests (the default is 16)
CONCURRENT_REQUESTS = 1
# Download delay
# DOWNLOAD_DELAY = 0.1
COOKIES_ENABLED = False

ITEM_PIPELINES = {
    'manhua.pipelines.mmonlyPipeline': 100,
}
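# The pipeline registered above lives in manhua/pipelines.py; its actual
# implementation is not shown in this file. A minimal sketch of an item
# pipeline with the interface Scrapy expects (class name taken from the
# setting, body assumed for illustration):
#
#     class mmonlyPipeline(object):
#         def open_spider(self, spider):
#             # Called once when the spider starts, e.g. to open a file or DB
#             self.items = []
#
#         def process_item(self, item, spider):
#             # Called for every scraped item; must return the item (or raise
#             # DropItem) so lower-priority pipelines still see it
#             self.items.append(dict(item))
#             return item
#
#         def close_spider(self, spider):
#             # Called once when the spider finishes
#             spider.logger.info('collected %d items', len(self.items))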
# Log level
LOG_LEVEL = 'INFO'
LOG_FILE = '/tmp/log.txt'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'manhua (+http://www.yourdomain.com)'

# Obey robots.txt rules (ROBOTSTXT_OBEY is set explicitly to False above)
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#     'manhua.middlewares.ManhuaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#     'manhua.middlewares.ManhuaDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#     'manhua.pipelines.ManhuaPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Maximum crawl depth (0, the default, means no limit)
# DEPTH_LIMIT = 100
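# Any project-wide setting in this file can be overridden for a single spider
# through its custom_settings class attribute. A short sketch (spider name is
# an assumption, reused from the example above):
#
#     class MmonlySpider(scrapy.Spider):
#         name = 'mmonly'
#         custom_settings = {
#             # Overrides the module-level values for this spider only
#             'CONCURRENT_REQUESTS': 4,
#             'LOG_LEVEL': 'DEBUG',
#         }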