javascript - Scraping React Site with Phantomjs -
I am scraping a website that is built with React components, using PhantomJS from Node.js,
with this wrapper: https://github.com/amir20/phantomjs-node
Here is the code:
// Open `url` in PhantomJS and print the page's HTML as soon as the load
// finishes. `phantom`, `url`, `_ph` and `_page` are expected to be provided by
// surrounding code that is not part of this snippet.
//
// NOTE: the content is read immediately after `open()` resolves, without
// waiting for any client-side rendering, which is why markup produced later by
// scripts on the page may be missing from the output.
phantom
  .create()
  .then(ph => {
    _ph = ph;
    // The phantomjs-node method is camelCase: `createPage`, not `createpage`.
    return _ph.createPage();
  })
  .then(page => {
    _page = page;
    return _page.open(url);
  })
  .then(status => {
    return _page.property('content');
  })
  .then(content => {
    console.log(content);
    _page.close();
    _ph.exit();
  })
  .catch(e => console.log(e));
The problem is that the React content is not rendered; the output only contains: <!-- react-empty: 1 -->
where the actual React component should have been loaded.
How can I scrape the rendered React component? I switched from a pure node-request solution to PhantomJS to fix this, but now I am stuck.
Update:
So I don't have a real solution yet. I switched to NightmareJS (https://github.com/segmentio/nightmare), which has a nice .wait('.some-selector')
function that waits until the specified selector is loaded. This fixed my problems with dynamically loaded React components.
I think you should wait for the React elements to be rendered on the page after the page has loaded. An example of such a waiting function, using Q promises, is below. The function returns a promise and checks the page state every 50 ms. If the required page state is reached, the function resolves the promise. In case of a timeout, the function rejects the promise.
// Scrape a client-side rendered page with PhantomJS: open the page, poll until
// the dynamically rendered content is present, then print the resulting HTML.
const phantom = require('phantom');
const q = require('q');

const url = 'https://tech.yandex.ru/maps/jsbox/';

// Shared between the steps of the promise chain; `_page` is also read by
// textpopulated().
let _ph;
let _page;

phantom
  .create()
  .then(ph => {
    _ph = ph;
    return _ph.createPage();
  })
  .then(page => {
    _page = page;
    return _page.open(url);
  })
  .then(status => {
    console.log(status);
    // Give the page up to 3 seconds to render the content we are after.
    return waitstate(textpopulated, 3);
  })
  .then(() => _page.property('content'))
  .then(content => {
    console.log(content);
  })
  .catch(e => console.log(e))
  .then(() => {
    // Runs after both success and failure, so the page and the PhantomJS
    // process are released even when the wait times out. The guards cover a
    // failure that happened before the page/instance was created.
    if (_page) {
      _page.close();
    }
    if (_ph) {
      _ph.exit();
    }
  });

/**
 * Page-state check: has the `.ace_text-layer` element been populated?
 *
 * @returns {Promise<boolean>} resolves to true once the element exists and has
 *   at least one child element.
 */
function textpopulated() {
  return _page
    .evaluate(function () {
      // This callback is executed inside the page, not in Node.js, so it sticks
      // to plain ES5 syntax and only touches the page's own DOM.
      var layer = document.querySelector('.ace_text-layer');
      return layer && layer.childElementCount;
    })
    .then(function (childElementCount) {
      console.log('childelementcount: ' + childElementCount);
      return childElementCount > 0;
    });
}

/**
 * Poll `state` every 50 ms until it reports true or the time limit is exceeded.
 *
 * @param {function(): Promise<*>} state - named function returning a promise
 *   for a truthy value once the desired page state is reached.
 * @param {number} [timeout] - time limit in seconds; when omitted (or 0) the
 *   limit falls back to 20 seconds.
 * @returns {Promise<undefined>} resolves when the state is reached.
 * @throws {Error} (as a rejection) when the time limit is exceeded, or whatever
 *   `state` itself rejects with.
 */
function waitstate(state, timeout) {
  console.log('start waiting state: ' + state.name);
  const limitTime = timeout * 1000 || 20000;
  const startTime = new Date();
  return wait();

  // One polling step; re-schedules itself until the state is reached or the
  // limit is hit.
  function wait() {
    return state().then(function (result) {
      if (result) {
        console.log('reached state: ' + state.name);
        return;
      }
      if (new Date() - startTime > limitTime) {
        const errorMessage = 'timeout state: ' + state.name;
        console.log(errorMessage);
        throw new Error(errorMessage);
      }
      return q.delay(50).then(wait);
    });
  }
}
Comments
Post a Comment