@@ -0,0 +1,4 @@ |
||
| 1 |
+book/ |
|
| 2 |
+.DS_Store |
|
| 3 |
+node_modules/ |
|
| 4 |
+npm-debug.log |
@@ -0,0 +1,72 @@ |
||
| 1 |
/*
 * Sequential book scraper.
 *
 * Logs in to Safari Books Online, opens the page given on the command line,
 * prints its content to a PDF inside `book/`, then follows the page's "next"
 * link until there is none left.
 *
 * Usage:
 *   SAFARI_EMAIL=... SAFARI_PASSWORD=... node <script> <start_link> [start_page] [wait_time]
 *
 *   start_link  URL of the first page to capture (required).
 *   start_page  Number used in the first PDF file name (default 1).
 *   wait_time   Milliseconds to wait before printing each page (default 2000).
 *
 * Credentials are read from the environment so they never live in the source.
 */
var fs = require('fs');
var Nightmare = require('nightmare');

var SITE_ROOT = 'https://www.safaribooksonline.com';
var LOGIN_URL = SITE_ROOT + '/accounts/login/';
var LOGIN_FORM = 'form[action*="/accounts/login/"]';
var OUTPUT_DIR = 'book';

// Options handed to Nightmare's .pdf() for every captured page.
var pdfOptions = {
  marginsType: 1,
  printBackground: false,
  printSelectionOnly: false,
  landscape: false,
  pageSize: "A4"
};

var start_link = process.argv[2];
// A missing, non-numeric or zero argument falls back to the default.
var start_page = parseInt(process.argv[3], 10) || 1;
var wait_time = parseInt(process.argv[4], 10) || 2000;

var email = process.env.SAFARI_EMAIL;
var password = process.env.SAFARI_PASSWORD;

// Created only after the arguments are validated, so no browser is started
// for an invocation that cannot do any work.
var nightmare;

/** Creates the output directory; an already existing directory is fine. */
function ensureOutputDir() {
  try {
    fs.mkdirSync(OUTPUT_DIR);
  } catch (err) {
    if (err.code !== 'EEXIST') {
      throw err;
    }
  }
}

/**
 * Makes a page title usable as a single file-name component: path separators
 * become dashes and runs of whitespace collapse to one space.
 */
function safeFileName(title) {
  return String(title).replace(/[\/\\]/g, '-').replace(/\s+/g, ' ').trim();
}

/**
 * Runs inside the page (it is passed to .evaluate), so it must be
 * self-contained and can only use what the page provides — here the page's
 * own jQuery `$`. Reads the "next" link and the title, then replaces the body
 * with the book content and adds print CSS.
 *
 * @returns {Array} [title, href of the next page or undefined]
 */
function extractPage() {
  // Read link and title first: the body is replaced right below.
  var link = $(".next.nav-link").attr('href');
  var title = $('h1.title').text();
  var $content = $('#sbo-rt-content').clone();
  $('body').html($content);
  $('head').append('<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>');
  return [title, link];
}

/**
 * Captures one page as a PDF and then recurses into the next page.
 *
 * @param {number} i - Page number used as the PDF file-name prefix.
 * @param {string} link - Absolute URL of the page to capture.
 * @returns {Promise} Settles once this page and all following pages are saved.
 */
var getPage = function (i, link) {
  console.log("Capturing page " + i + ":");
  return nightmare
    .viewport(1200, 800)
    .goto(link)
    .wait(1000)
    .evaluate(extractPage)
    .then(function (result) {
      var title = result[0];
      var next = result[1];
      console.log("> title: " + title);
      console.log("> next: " + next);
      // Nightmare only runs queued actions once .then() is called, so the
      // PDF step is awaited explicitly; otherwise the last page is never saved.
      return nightmare
        .wait(wait_time)
        .pdf(OUTPUT_DIR + "/" + i + "- " + safeFileName(title) + ".pdf", pdfOptions)
        .then(function () {
          if (next == null) {
            return null;
          }
          return getPage(i + 1, SITE_ROOT + next);
        });
    });
};

if (!start_link) {
  console.log("Please provide a link for the scraping starting point.");
} else if (!email || !password) {
  console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
  process.exitCode = 1;
} else {
  ensureOutputDir();
  nightmare = Nightmare({ show: true });
  nightmare
    .viewport(1200, 800)
    .goto(LOGIN_URL)
    .type(LOGIN_FORM + ' [name=email]', email)
    .type(LOGIN_FORM + ' [name=password1]', password)
    .click(LOGIN_FORM + ' [type=submit]')
    .wait(1000)
    .then(function () {
      return getPage(start_page, start_link);
    })
    .catch(function (error) {
      console.error('Scraping failed:', error);
      process.exitCode = 1;
    })
    .then(function () {
      // .end() is itself a queued action: it needs .then() to actually run.
      return nightmare.end().then(function () {});
    });
}
@@ -0,0 +1,14 @@ |
||
| 1 |
+{
|
|
| 2 |
+ "name": "betabot-scraper", |
|
| 3 |
+ "version": "0.0.1", |
|
| 4 |
+ "description": "Web scraping scripts for the Betabot project", |
|
| 5 |
+ "main": "index.js", |
|
| 6 |
+ "scripts": {
|
|
| 7 |
+ "test": "echo \"Error: no test specified yet\" && exit 1" |
|
| 8 |
+ }, |
|
| 9 |
+ "author": "James Peret", |
|
| 10 |
+ "license": "ISC", |
|
| 11 |
+ "dependencies": {
|
|
| 12 |
+ "nightmare": "^2.4.1" |
|
| 13 |
+ } |
|
| 14 |
+} |
@@ -0,0 +1,18 @@ |
||
| 1 |
/*
 * Example: search Yahoo for "github nightmare" and print the URL of the
 * first result, or report the failure on stderr.
 */
var Nightmare = require('nightmare');
var browser = Nightmare({ show: true });

// Evaluated inside the page: returns the href of the first search result.
function firstResultHref() {
  var anchor = document.querySelector('#main .searchCenterMiddle li a');
  return anchor.href;
}

// Prints the value produced by the in-page evaluation.
function printResult(href) {
  console.log(href);
}

// Reports any failure raised along the chain.
function reportFailure(err) {
  console.error('Search failed:', err);
}

var search = browser
  .goto('http://yahoo.com')
  .type('form[action*="/search"] [name=p]', 'github nightmare')
  .click('form[action*="/search"] [type=submit]')
  .wait('#main')
  .evaluate(firstResultHref)
  .end();

search
  .then(printResult)
  .catch(reportFailure);
@@ -0,0 +1,68 @@ |
||
| 1 |
/*
 * Table-of-contents book scraper.
 *
 * Logs in to Safari Books Online, opens the book's landing page, saves it as
 * `index.pdf`, collects the chapter links from the table of contents and then
 * prints each linked page to its own PDF in the current directory.
 *
 * Usage:
 *   SAFARI_EMAIL=... SAFARI_PASSWORD=... node <script>
 *
 * Credentials are read from the environment so they never live in the source.
 */
var Nightmare = require('nightmare');

var SITE_ROOT = 'https://www.safaribooksonline.com';
var LOGIN_URL = SITE_ROOT + '/accounts/login/';
var LOGIN_FORM = 'form[action*="/accounts/login/"]';
var BOOK_URL = SITE_ROOT + '/library/view/learning-robotics-using/9781783287536/';

// Options handed to Nightmare's .pdf() for the index and for every chapter.
var options = {
  marginsType: 1,
  printBackground: false,
  printSelectionOnly: false,
  landscape: false,
  pageSize: "A4"
};

var email = process.env.SAFARI_EMAIL;
var password = process.env.SAFARI_PASSWORD;

// Created only after the credentials are validated.
var nightmare;

/**
 * Makes a page title usable as a single file-name component: path separators
 * become dashes and runs of whitespace collapse to one space.
 */
function safeFileName(title) {
  return String(title).replace(/[\/\\]/g, '-').replace(/\s+/g, ' ').trim();
}

/**
 * Runs inside the page (passed to .evaluate); uses the page's own jQuery `$`.
 * Collects the href of the first anchor under every `.detail-toc li ol`.
 *
 * @returns {Array} One entry per matched list; undefined where no anchor has an href.
 */
function collectTocLinks() {
  var links = [];
  $(".detail-toc li ol").each(function () {
    var a_href = $(this).find('a').attr('href');
    links.push(a_href);
  });
  return links;
}

/**
 * Runs inside the page (passed to .evaluate); must stay self-contained.
 * Replaces the body with the book content, adds print CSS and returns the
 * title found after the replacement.
 *
 * @returns {string} Text of `h1.title`.
 */
function preparePageForPrint() {
  console.log("> Copying content to body");
  var $content = $('#sbo-rt-content').clone();
  $('body').html($content);
  console.log("> Changing print CSS");
  $('head').append('<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>');
  return $('h1.title').text();
}

/**
 * Captures links[i] as a PDF and then continues with the following index.
 *
 * @param {Array} links - Site-relative hrefs collected from the table of contents.
 * @param {number} i - Index of the link to capture; also the PDF file-name prefix.
 * @returns {Promise} Settles once this and all following pages are saved.
 */
function getPage(links, i) {
  if (i >= links.length) {
    return Promise.resolve();
  }
  console.log("Capturing page " + i + ":");
  console.log(links[i]);
  if (links[i] == null) {
    // No href was found for this entry; concatenating it would give a bogus URL.
    console.log("> no link, skipping");
    return getPage(links, i + 1);
  }
  return nightmare
    .viewport(1200, 800)
    .goto(SITE_ROOT + links[i])
    .wait(1000)
    .evaluate(preparePageForPrint)
    .then(function (title) {
      console.log(title);
      // Nightmare only runs queued actions once .then() is called, so the
      // PDF step is awaited explicitly; otherwise the last page is never saved.
      return nightmare
        .pdf(i + "- " + safeFileName(title) + ".pdf", options)
        .wait(2000)
        .then(function () {
          return getPage(links, i + 1);
        });
    });
}

if (!email || !password) {
  console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
  process.exitCode = 1;
} else {
  nightmare = Nightmare({ show: true });
  nightmare
    .viewport(1200, 800)
    .goto(LOGIN_URL)
    .type(LOGIN_FORM + ' [name=email]', email)
    .type(LOGIN_FORM + ' [name=password1]', password)
    .click(LOGIN_FORM + ' [type=submit]')
    .wait(1000)
    .goto(BOOK_URL)
    .wait(100)
    .pdf('index.pdf', options)
    .evaluate(collectTocLinks)
    .then(function (result) {
      console.log("Found " + result.length + " links.");
      // NOTE(review): capture starts at index 1, so the first collected link
      // is never fetched — kept as in the original; confirm this is intended.
      return getPage(result, 1);
    })
    .catch(function (error) {
      console.error('Scraping failed:', error);
      process.exitCode = 1;
    })
    .then(function () {
      // .end() is itself a queued action: it needs .then() to actually run.
      return nightmare.end().then(function () {});
    });
}