javascript - Scraping React Site with Phantomjs -


I am scraping a website that uses React components, using PhantomJS in Node.js.

with this: https://github.com/amir20/phantomjs-node

Here is the code:

// Open `url` in a PhantomJS page and print the page's HTML content.
// `_ph`, `_page` and `url` are expected to be declared by the surrounding script.
phantom.create().then(ph => {
    _ph = ph;
    // The phantomjs-node method is camelCase: createPage, not createpage.
    return _ph.createPage();
}).then(page => {
    _page = page;
    return _page.open(url);
}).then(status => {
    return _page.property('content');
}).then(content => {
    console.log(content);
}).catch(e => console.log(e)).then(() => {
    // Runs after both success and failure so the PhantomJS child process
    // is never left running when an earlier step rejects.
    if (_page) {
        _page.close();
    }
    if (_ph) {
        _ph.exit();
    }
});

The problem is that the React content is not rendered. The output says: <!-- react-empty: 1 --> where the actual React component should have been loaded.

How can I scrape the rendered React component? I switched from a pure node-request solution to PhantomJS to fix this, but now I am stuck.


update:

So I don't have a real solution yet. I switched to Nightmare.js (https://github.com/segmentio/nightmare), which has a nice .wait('.some-selector') function that waits until the specified selector is loaded. This fixed my problems with dynamically loaded React components.

I think you should wait for the React elements to render on the page after the page has loaded. An example of such a waiting function, using Q promises, is below. The function returns a promise and checks the page state every 50 ms. If the required page state is reached, the function resolves the promise. In case of a timeout, the function rejects the promise.

var phantom = require('phantom');
var q = require('q');
var _ph, _page, _outobj; // _outobj is not used anywhere in this snippet
var url = 'https://tech.yandex.ru/maps/jsbox/';

// Open the page, wait (up to 3 seconds) until the editor text layer has
// children, then print the rendered HTML.
phantom.create().then(ph => {
    _ph = ph;
    // The phantomjs-node method is camelCase: createPage, not createpage.
    return _ph.createPage();
}).then(page => {
    _page = page;
    return _page.open(url);
}).then(status => {
    console.log(status);
    return waitstate(textpopulated, 3);
}).then(() => {
    return _page.property('content');
}).then(content => {
    console.log(content);
}).catch(e => console.log(e)).then(() => {
    // Runs after both success and failure (including a wait timeout) so the
    // page and the PhantomJS child process are always released.
    if (_page) {
        _page.close();
    }
    if (_ph) {
        _ph.exit();
    }
});

/**
 * Page-state predicate: checks whether the `.ace_text-layer` element exists
 * and has at least one child element.
 *
 * @returns {Promise<boolean>} resolves to true once the layer has children.
 */
function textpopulated() {
    // The callback is handed to page.evaluate, so it is kept as a plain
    // ES5 function with no references to variables of this script.
    return _page.evaluate(function() {
        var layer = document.querySelector('.ace_text-layer');
        return layer && layer.childElementCount;
    }).then(function(childElementCount) {
        console.log('childelementcount: ' + childElementCount);
        // A missing layer yields null here, and `null > 0` is false.
        return childElementCount > 0;
    });
}

/**
 * Polls `state` every 50 ms until it resolves to a truthy value or the time
 * limit is exceeded.
 *
 * @param {function(): Promise<*>} state - named function returning a promise
 *     of the current page state; its `name` is used in log messages.
 * @param {number} [timeout] - limit in seconds; when omitted (or 0) the
 *     limit is 20000 ms.
 * @returns {Promise<undefined>} resolves when the state is reached; rejects
 *     with an Error ('timeout state: <name>') on timeout, or with whatever
 *     `state()` rejects with.
 */
function waitstate(state, timeout) {
    console.log('start waiting state: ' + state.name);

    var limittime = timeout * 1000 || 20000;
    var starttime = Date.now();

    return wait();

    // One polling step; re-schedules itself until success or timeout.
    function wait() {
        return state().then(function(result) {
            if (result) {
                console.log('reached state: ' + state.name);
                return;
            }
            if (Date.now() - starttime > limittime) {
                var errormessage = 'timeout state: ' + state.name;
                console.log(errormessage);
                throw new Error(errormessage);
            }
            return q.delay(50).then(wait);
        });
    }
}

Comments

Popular posts from this blog

java - SSE Emitter : Manage timeouts and complete() -

jquery - uncaught exception: DataTables Editor - remote hosting of code not allowed -

java - How to resolve error - package com.squareup.okhttp3 doesn't exist? -