A Node.js web scraper that downloads books from Safari Books Online (safaribooksonline.com), saving each page as a PDF.

index.js 2.3KB

    'use strict';

    /**
     * Saves consecutive pages of a Safari Books Online book as PDF files.
     *
     * Usage:
     *   SAFARI_EMAIL=<email> SAFARI_PASSWORD=<password> \
     *     node index.js <start_link> [start_page] [wait_time]
     *
     *   start_link  URL of the first page to capture (required).
     *   start_page  Number used in the first output file name (default 1).
     *   wait_time   Milliseconds to wait before printing each page (default 2000).
     *
     * The script logs in, opens <start_link>, strips the page down to the
     * "#sbo-rt-content" element, prints it to "book/<n>- <title>.pdf" and then
     * follows the ".next.nav-link" link until no such link is found.
     *
     * Credentials are read from the environment so that they never have to be
     * stored in the source file.
     */

    var fs = require('fs');
    var Nightmare = require('nightmare');

    var SITE_ORIGIN = 'https://www.safaribooksonline.com';
    var LOGIN_URL = SITE_ORIGIN + '/accounts/login/';
    var LOGIN_FORM = 'form[action*="/accounts/login/"]';
    var OUTPUT_DIR = 'book';
    var DEFAULT_START_PAGE = 1;
    var DEFAULT_WAIT_TIME = 2000;

    // Options passed to Nightmare's .pdf() for every captured page.
    var PDF_OPTIONS = {
      marginsType: 1,
      printBackground: false,
      printSelectionOnly: false,
      landscape: false,
      pageSize: "A4"
    };

    /**
     * Creates `dir` if it does not exist yet.
     * An already existing directory is not an error; anything else is rethrown.
     */
    function ensureDirectory(dir) {
      try {
        fs.mkdirSync(dir);
      } catch (err) {
        if (err.code !== 'EEXIST') {
          throw err;
        }
      }
    }

    /**
     * Builds the output path for one page.
     * Path separators in the title are replaced with "_" because they would
     * otherwise be interpreted as (non-existent) sub-directories.
     */
    function pdfPathFor(pageNumber, title) {
      var safeTitle = String(title).replace(/[\/\\]/g, '_');
      return OUTPUT_DIR + "/" + pageNumber + "- " + safeTitle + ".pdf";
    }

    /**
     * Submits the login form and resolves once the post-login wait is over.
     */
    function login(browser, email, password) {
      return browser
        .viewport(1200, 800)
        .goto(LOGIN_URL)
        .type(LOGIN_FORM + ' [name=email]', email)
        .type(LOGIN_FORM + ' [name=password1]', password)
        .click(LOGIN_FORM + ' [type=submit]')
        .wait(1000)
        .then(function () {});
    }

    /**
     * Captures `pageUrl` as a PDF and then, recursively, every following page.
     * The returned promise settles only after the last page has been written,
     * so a failure on any page reaches the caller's rejection handler.
     */
    function capturePages(browser, pageNumber, pageUrl, waitTime) {
      console.log("Capturing page " + pageNumber + ":");

      return browser
        .viewport(1200, 800)
        .goto(pageUrl)
        .wait(1000)
        // This callback is serialised and executed inside the page, so it
        // must not reference anything from this file. `$` is assumed to be
        // the jQuery instance already loaded by the site.
        .evaluate(function () {
          var nextHref = $(".next.nav-link").attr('href');
          var title = $('h1.title').text();
          var $content = $('#sbo-rt-content').clone();
          // Keep only the book content so that navigation chrome is not printed.
          $('body').html($content);
          $('head').append('<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>');
          // `next` is null when the page has no "next" link.
          return { title: title, next: nextHref || null };
        })
        .then(function (page) {
          console.log("> title: " + page.title);
          console.log("> next: " + page.next);

          // Nightmare only runs queued actions once .then() is called, so the
          // chain is finished here; otherwise the PDF of the final page would
          // stay queued and never be written.
          return browser
            .wait(waitTime)
            .pdf(pdfPathFor(pageNumber, page.title), PDF_OPTIONS)
            .then(function () {
              if (page.next == null) {
                return undefined;
              }
              return capturePages(browser, pageNumber + 1, SITE_ORIGIN + page.next, waitTime);
            });
        });
    }

    /**
     * Entry point: validates the invocation, then logs in and captures pages.
     */
    function main() {
      var startLink = process.argv[2];
      // A missing, non-numeric or zero argument falls back to the default.
      var startPage = parseInt(process.argv[3], 10) || DEFAULT_START_PAGE;
      var waitTime = parseInt(process.argv[4], 10) || DEFAULT_WAIT_TIME;
      var email = process.env.SAFARI_EMAIL;
      var password = process.env.SAFARI_PASSWORD;

      if (!startLink) {
        console.log("Please provide a link for the scraping starting point.");
        return;
      }
      if (!email || !password) {
        console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
        process.exitCode = 1;
        return;
      }

      ensureDirectory(OUTPUT_DIR);

      // Created only after validation so an invalid invocation never launches
      // the browser.
      var browser = Nightmare({ show: true });

      login(browser, email, password)
        .then(function () {
          return capturePages(browser, startPage, startLink, waitTime);
        })
        .then(function () {
          // .end() is itself only queued; returning the thenable makes the
          // promise chain run it and close the browser.
          return browser.end();
        })
        .catch(function (err) {
          console.error("Scraping failed:", err);
          process.exitCode = 1;
          return browser.end().then(null, function (endErr) {
            console.error("Failed to close the browser:", endErr);
          });
        });
    }

    main();