转载

使用 slimerjs 抓取 DDos 保护的站点

本站文章除注明转载外，均为本站原创或者翻译。
本站文章欢迎各种形式的转载，但请18岁以上的转载者注明文章出处，尊重我的劳动，也尊重你的智商；
本站部分原创和翻译文章提供markdown格式源码，欢迎使用文章源码进行转载；
本文标题：使用 slimerjs 抓取 DDos 保护的站点
本文链接： http://zengrong.net/post/2366.htm

我准备在一个图片站上抓点图，但发现它启用了 DDos 保护。站点会首先显示一段文本：

This process is automatic. Your browser will redirect to your requested content shortly.

要求你等待几秒钟检测浏览器，然后通过 302 重定向跳转到正确的页面（当然，这个正确的页面地址依然没变）。

等待的过程表现在浏览器上是这样的：

使用 slimerjs 抓取 DDos 保护的站点

这个保护的详细说明在这里： CloudFlare advanced DDoS protection 。

让我们看看怎么来解决这个问题。

源码

打开页面看源码，可以看到这样的 javascript 代码：

(function(){ var a = function() {try{return !!window.addEventListener} catch(e) {return !1} }, b = function(b, c) {a() ? document.addEventListener("DOMContentLoaded", b, c) : document.attachEvent("onreadystatechange", b)}; b(function(){   var a = document.getElementById('cf-content');a.style.display = 'block';   setTimeout(function(){     var t,r,a,f, cgksoUW={"bRM":+((+!![]+[])+(!+[]+!![]))};     t = document.createElement('div');     t.innerHTML="<a href='/'>x</a>";     t = t.firstChild.href;r = t.match(/https?://///)[0];     t = t.substr(r.length); t = t.substr(0,t.length-1);     a = document.getElementById('jschl-answer');     f = document.getElementById('challenge-form');     ;cgksoUW.bRM-=+((!+[]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]));cgksoUW.bRM*=+((!+[]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]));cgksoUW.bRM*=+((!+[]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]));cgksoUW.bRM+=+((!+[]+!![]+[])+(!+[]+!![]));cgksoUW.bRM*=+((!+[]+!![]+!![]+[])+(+[]));a.value = parseInt(cgksoUW.bRM, 10) + t.length;     f.submit();   }, 5000); }, false); })();

这是 JS 中出现的几个相关 Element：

<p data-translate="process_is_automatic">This process is automatic. Your browser will redirect to your requested content shortly.</p> <p data-translate="allow_5_secs">Please allow up to 5 seconds…</p> </div> <form id="challenge-form" action="/cdn-cgi/l/chk_jschl" method="get"> <input type="hidden" name="jschl_vc" value="c841963d655c6bb040c4ef02fab8d5a3"/> <input type="hidden" name="pass" value="1441270501.274-WM96e3sQx+"/> <input type="hidden" id="jschl-answer" name="jschl_answer"/> </form>

分析

上面代码的作用，主要是原来判断访问站点的客户端是否是一个真实的浏览器。

访问网站主页的时候，首先得到的是一个 503 错误，显示上面的页面内容。然后等待 5 秒，让用户看清楚提示信息，然后再根据得到的随机值东拼西凑计算出另一个值，写入 jschl-answer 中，最后通过 challenge-form 这个表单提交。

提交成功之后， /cdn-cgi/l/chk_jschl 会使用 302 重定向返回真实的网页内容。

cgksoUW 是一个随机的名称， pass 那个 input 中包含的值也是一个变化的值。每次进入这个页面，这两个值都会变化。

那么，pass 的值在提交的时候到底是如何计算出来的？

从 f = document.getElementById('challenge-form'); 下面的那段代码可以看出，这个计算无非是对 cgksoUW.bRM 进行一些加乘运算。然后再转成一个 Int 进行提交。关键在于那些看起来莫名其妙的加号、括号和方括号的集合的含义为何？

这是一个简单的障眼法。

让我们看看 cgksoUW.bRM 的初始值，这个表达式的结果等于数字 12 ：

+((+!![]+[])+(!+[]+!![]))

我们把上面的代码拆成几个子表达式，就好理解了。

首先是第一个嵌套括号中的： (+!![]+[]) ，打开浏览器的 console ,在其中输入 +!![] 回车，可以得到数字 1 。

先不管哪个加号，单独看 !![] 的值，它是布尔值 true 。这是因为 [] 返回的是一个有效的 Array，代表 true ，两次取反，依然是 true 。

而左边的那个加号由于没有提供左操作数，因此它的含义是正数，这里做了一次数字转换，自动把 true 转换成了 1 。

接着往后算，后面是个加号，然后是个 [] 。空的数组，默认是作为字符串处理的。 [].toString() 的值为空字符串。前面的 1 加上后面的空字符串，得到一个字符串 "1" 。

知道了原理，可以使用同样的方式算出第二个嵌套括号中的值为数字 2 。

接下来是字符串 "1" 加上数字 2，得到字符串 "12" 。

最左边的加号又做了一次转换，将字符串 "12" 转换成了数字 12 。

所以，这一段看似乱码的代码，只是为了获取一些随机的数字，便于和服务器验证罢了。我判断这些数值都是有意义的，而且和服务器时间关联，用来判断访问网站的是不是标准的浏览器。

选择

分析完之后，就要进行技术的选择了。

写爬虫自然是首选 Python 。开始，我准备使用 requests 来模拟请求，使用 BeautifulSoup4 来解析得到的 HTML，然后进行资源的下载。

但 requests 并不擅于模拟浏览器的行为，也不能执行 Javascript，还需要用 PyV8 等库来执行上面的数据计算，得到最终的 pass 。

PyExecJS 可以实现在 Python 环境中调用 Javascript 引擎，支持的引擎有 PyV8 ， Node.js 等等。

成功跳过 DDos 防护之后，还需要保持 cookies，构建 heads 。

既然计算部分要依赖别的引擎解析 Javascript ，那还不如直接使用 Javascript 来写好了。

这就是 Headless Browser 大显身手的时候了。

Headless Browser

Headless Browser ，通俗地来说就是个无界面且完整功能的浏览器。目前最流行的 Headless Browser 框架非 PhantomJS 莫属。它是基于 WebKit 开发的，想要整个网页截图神马的（再长也不怕！）用它就最好了。

不过，用 PhantomJS 保存网页中的图片还真是有点麻烦。由于上面的 DDOS 防护的存在，每个单独的图像文件依然存在防护，所以不能拿到地址后直接下载，必须使用浏览器来浏览这个图片。

于是我选择了另一个框架 SlimerJS ，它和 PhantomJS 功能基本一致，API 也基本一致，所不同的就是它使用 Mozilla Firefox 的 Gecko 引擎。

SlimerJS 的方便之处就在于它的 OnResourceReceived 回调支持 body 这个属性，其中包含正在访问的页面资源（页面中的 CSS、JS、Image 或者页面本身都算资源）的内容。我只需要简单地把它写入到文件中就能得到我需要的图片了。就像这样：

function onResourceReceived(response) {  if(state != 'image' || response.stage!="end" || response.stage == "fail" || !response.bodySize) return;  console.log('[onResourceReceived] id: '+response.id + ' starge:' + response.stage + ' contentType:' + response.contentType+' url:'+response.url);  var curGalleryObj = galleryList[curGallery];  var curImageObj = curGalleryImages[curImage];  if(!fs.exists(curGalleryObj.dir))  {   fs.makeTree(curGalleryObj.dir);  }  var fname = curGalleryObj.dir+"/"+curImage+'.'+curImageObj.ext;  curImageObj.file = fname;  curImageObj.done = true;  fs.write(fname, response.body, 'b'); }

下载流程

下面我挑选下载流程中的一些比较重要的实现贴出来，其实很简单的啦。

function getGalleryList(htmlFile, callback) {  //htmlFile 是个本地的 HTML 文件，这是为了方便  var file = htmlDir + '/' + htmlFile + '.html';  var p = WebPage.create();  //不解析这个文件中的javascript 因为有些 js 保存在 google 的服务器上，下载很慢  p.settings.javascriptEnabled = false;  //不再如这个文件中的图像，原因同上  p.settings.loadImages = false;  p.open(file, function(status)    {     //在页面中调用 DOM 函数或者 DOM 列表     var list = p.evaluate(function()      {       return document.querySelectorAll('.content .gallery-list div a');     var newList = [];     //根据列表构建需要下载的 Gallery 列表     for(var i=0;i<list.length;i++)     {      var href = list[i].href.slice(8);      var arr = href.split('/');      var obj = {};      obj.url = href;      obj.id = arr[1];      obj.name = arr[2].split('.')[0];      obj.dir = imageDir+'/['+obj.id+']'+obj.name;      obj.index = site + '/images/' + obj.id + '/' + obj.name + '.html';      obj.progress = obj.dir + '/progress.json';      newList[i] = obj;     }     callback(newList);    }); } function onIndexOpen(status) {  console.log('[onIndexOpen]');  state = 'index';  var intervalId = null;  var t = 0;  var _checkTimeout = function()  {   if(t > 60)   {    console.log('!!!!Timeout!!!!');    clearInterval(intervalId);    slimer.exit();   }  }  var _checkTitle = function()  {   console.log('[_checkTitle]');   console.log('['+t+'] Check Title...'+page.title);   t++;   _checkTimeout();   //若出现正确的标题则代表载入成功   if(page.title.indexOf('Welcome') == 0)   {    clearInterval(intervalId);    t = 0;    page.close();    openGallery();   }  }  intervalId = setInterval(_checkTitle, 1000); } function openGallery() {  console.log('[openGallery '+curGallery+']');  if(curGallery >= galleryList.length)  {   console.log('==== Download all Gallery Done! ====');   slimer.exit();  }  curImage = 0;  var curGalleryObj = galleryList[curGallery];  //本地已有列表，不必再获取  if(fs.exists(curGalleryObj.progress))  {   curGalleryImages = JSON.parse(fs.read(curGalleryObj.progress, 'r')).images;   //本地已经有这个文件了（上次下载成功），下一个   while(curGalleryImages[curImage] && curGalleryImages[curImage].done)   {    console.log('The file '+curGalleryImages[curImage].file + ' is downloaded.');    curImage++;   }   //开始下载文件   downloadImage();  }  else  {   state = 'gallery';   page.open(curGalleryObj.index, onGalleryOpen);  } } function onGalleryOpen(status) {  console.log('[onGalleryOpen '+curGallery+']');  //script 标签中保存了 JSON 格式的图像列表，直接作为字符串获取它们  //这样就不必翻页分析 DOM 了  var script = page.evaluate(function()    {     return document.querySelector('body > div.outer-wrapper.image-page > div.page-wrapper.page-wrapper-full > script');    });  if(script)  {   var re = //"images/":(.+/])/}/);/i;   var images = script.innerHTML.match(re)[1];   curGalleryImages = JSON.parse(images);   curGalleryImages.forEach(function(e,i,a)     {      var name = e.f.split('.');      a[i].ext = name[1];      a[i].name = name[0];     });   curImage = 0;   downloadImage();  }  else  {   console.log('NO SCRIPT');   slimer.exit();  } } function downloadImage() {  if(curImage >= curGalleryImages.length)  {   var curGalleryObj = galleryList[curGallery];   console.log('==== download '+curGalleryObj.dir +' has done. ====');   //下载下一个 gallery   curGallery ++;   openGallery();   return;  }  state = 'image';  var image = getCurImage();  console.log('[downloadImage]'+image);  //关闭当前的 page  page.close();  //重用资源  page.open(image); } // 检测文件是否是标准的 JPEG function checkJPEG(path) {  console.log('[checkJPEG '+path+ ' '+fs.exists(path)+']');  if(fs.exists(path))  {   var content = fs.read(path, 'rb');   if( content &&    //content.length > 10240 &&    content.charCodeAt(0) == 0xff &&    content.charCodeAt(1) == 0xd8 &&    content.charCodeAt(2) == 0xff &&    content.charCodeAt(3) == 0xe0)   {    return true;   }  }  return false; } function onLoadFinished(status, url, isFrame) {  loading = false;  console.log('==== ['+state+']onLoadFinished Loading page('+url+') '+ status + " loading:"+loading);  if(state == 'image')  {   var curGalleryObj = galleryList[curGallery];   var curImageObj = curGalleryImages[curImage];   var progressObj = {'gallery':curGalleryObj, 'images':curGalleryImages};   // 判断 JPEG 文件头，看看是否需要重新下载   if(checkJPEG(curImageObj.file))   {    curImageObj.done = true;    curImage++;   }   else   {    curImageObj.done = false;   }   // 将当前的下载进度写入到配置文件中，以便一次下载不完，   // 或异常退出后。下次下载时可以知道进度   fs.write(curGalleryObj.progress, JSON.stringify(progressObj), 'w');   //下载下一个文件，或者重新下载   downloadImage();  } } function onLoadStarted() {  loading = true;  var currentUrl = page.evaluate(function() {   return window.location.href;  });  console.log('==== ['+state+']onLoadStarted Current page(' + currentUrl + ') will gone. loading:'+loading); }; function onUrlChanged(targetUrl) {  console.log('=== ['+state+']onUrlChanged New URL: ' + targetUrl+' loading:'+loading); } var WebPage = require('webpage'); var page = WebPage.create(); var loading = false; var imageDir = 'images'; var htmlDir = 'html'; var site = 'http://examples.com'; var fs = require('fs'); var page = WebPage.create(); var galleryList = null; var curGallery = null; var curGalleryImages = null; var curImage = 0; var state = null; page.onLoadStarted = onLoadStarted; page.onLoadFinished = onLoadFinished; page.onUrlChanged = onUrlChanged page.onResourceReceived = onResourceReceived; getGalleryList('favorites_0', function(list)   {    galleryList = list;    curGallery = 1;    page.open(site, onIndexOpen);   });