@@ -0,0 +1,4 @@ |
||
1 |
+book/ |
|
2 |
+.DS_Store |
|
3 |
+node_modules/ |
|
4 |
+npm-debug.log |
@@ -0,0 +1,72 @@ |
||
1 |
var fs = require('fs');
var Nightmare = require('nightmare');

/*
 * Saves a book as one PDF per page by following each page's "next" link.
 *
 * Usage:
 *   SAFARI_EMAIL=... SAFARI_PASSWORD=... node <this script> <start_link> [start_page] [wait_time]
 *
 *   start_link  URL of the first page to capture (required).
 *   start_page  Number used for the first PDF file name (default 1).
 *   wait_time   Milliseconds to wait before printing each page (default 2000).
 *
 * Credentials are read from the environment so that no account secret is
 * stored in the source file.
 */

var BASE_URL = 'https://www.safaribooksonline.com';
var LOGIN_URL = BASE_URL + '/accounts/login/';
var LOGIN_FORM = 'form[action*="/accounts/login/"]';
var OUTPUT_DIR = 'book';

// Options passed to nightmare.pdf() for every captured page.
var options = {
  marginsType: 1,
  printBackground: false,
  printSelectionOnly: false,
  landscape: false,
  pageSize: "A4"
};

var start_link = process.argv[2];
var start_page = parseInt(process.argv[3], 10);
var wait_time = parseInt(process.argv[4], 10);

var email = process.env.SAFARI_EMAIL;
var password = process.env.SAFARI_PASSWORD;

/**
 * Makes a page title usable as a single file-name component.
 * Path separators would otherwise be interpreted as directories.
 *
 * @param {*} title - Title text read from the page (may be empty or null).
 * @returns {string} The title with "/" and "\" replaced by "-".
 */
function safeFileName(title) {
  return String(title == null ? '' : title).replace(/[\/\\]/g, '-');
}

/**
 * Runs inside the page (passed to nightmare.evaluate, so it must not
 * reference anything from this file). Reads the next-page link and the title,
 * then strips the page down to the book content and injects print CSS.
 *
 * @param {number} i - Page number, passed through unchanged.
 * @returns {Array} [i, title, link]; link is missing when there is no next page.
 */
function extractPage(i) {
  var link = $(".next.nav-link").attr('href');
  var title = $('h1.title').text();
  var $content = $('#sbo-rt-content').clone();
  $('body').html($content);
  $('head').append('<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>');
  return [i, title, link];
}

/**
 * Validates the inputs, logs in, captures every page, and always shuts the
 * browser down afterwards.
 */
function main() {
  if (!start_link) {
    console.log("Please provide a link for the scraping starting point.");
    return;
  }
  if (!email || !password) {
    console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
    process.exitCode = 1;
    return;
  }
  if (!start_page) {
    start_page = 1;
  }
  if (!wait_time) {
    wait_time = 2000;
  }
  // The output directory is git-ignored, so it may not exist yet.
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR);
  }

  // Created only after validation so no browser is started when there is
  // nothing to do.
  var nightmare = Nightmare({ show: true });

  /**
   * Captures page `i` at `link`, then recurses into the next page.
   *
   * @param {number} i - Page number used in the log and the file name.
   * @param {string} link - Absolute URL of the page to capture.
   * @returns {Promise} Settles once this page and all following pages are saved.
   */
  var getPage = function (i, link) {
    console.log("Capturing page " + i + ":");
    return nightmare
      .viewport(1200, 800)
      .goto(link)
      .wait(1000)
      .evaluate(extractPage, i)
      .then(function (result) {
        var title = result[1];
        var next = result[2];
        console.log("> title: " + title);
        console.log("> next: " + next);
        // Per Nightmare's documentation, queued actions only run once the
        // chain is consumed with .then(); without it the last page's PDF
        // would never be written.
        return nightmare
          .wait(wait_time)
          .pdf(OUTPUT_DIR + "/" + i + "- " + safeFileName(title) + ".pdf", options)
          .then(function () {
            if (next != null) {
              return getPage(i + 1, BASE_URL + next);
            }
          });
      });
  };

  nightmare
    .viewport(1200, 800)
    .goto(LOGIN_URL)
    .type(LOGIN_FORM + ' [name=email]', email)
    .type(LOGIN_FORM + ' [name=password1]', password)
    .click(LOGIN_FORM + ' [type=submit]')
    .wait(1000)
    .then(function () {
      return getPage(start_page, start_link);
    })
    .catch(function (error) {
      console.error('Scraping failed:', error);
      process.exitCode = 1;
    })
    .then(function () {
      // .end() must be followed by .then() for the shutdown to execute.
      return nightmare.end().then(function () {});
    })
    .catch(function (error) {
      console.error('Failed to close the browser:', error);
      process.exitCode = 1;
    });
}

main();
@@ -0,0 +1,14 @@ |
||
1 |
+{ |
|
2 |
+ "name": "betabot-scraper", |
|
3 |
+ "version": "0.0.1", |
|
4 |
+ "description": "Web scraping scripts for the Betabot project", |
|
5 |
+ "main": "index.js", |
|
6 |
+ "scripts": { |
|
7 |
+ "test": "echo \"Error: no test specified yet\" && exit 1" |
|
8 |
+ }, |
|
9 |
+ "author": "James Peret", |
|
10 |
+ "license": "ISC", |
|
11 |
+ "dependencies": { |
|
12 |
+ "nightmare": "^2.4.1" |
|
13 |
+ } |
|
14 |
+} |
@@ -0,0 +1,18 @@ |
||
1 |
var Nightmare = require('nightmare');
var nightmare = Nightmare({ show: true });

// Selectors for the search form used below.
var SEARCH_FORM = 'form[action*="/search"]';

/**
 * Runs inside the page: returns the href of the first matching result link.
 */
function firstResultHref() {
  return document.querySelector('#main .searchCenterMiddle li a').href;
}

/** Prints the value produced by the in-page evaluation. */
function printResult(href) {
  console.log(href);
}

/** Reports any failure raised while running the search chain. */
function reportFailure(err) {
  console.error('Search failed:', err);
}

// Queue the whole search: open the site, submit the query, wait for the
// results container, read the first link, then close the browser.
var search = nightmare
  .goto('http://yahoo.com')
  .type(SEARCH_FORM + ' [name=p]', 'github nightmare')
  .click(SEARCH_FORM + ' [type=submit]')
  .wait('#main')
  .evaluate(firstResultHref)
  .end();

search
  .then(printResult)
  .catch(reportFailure);
@@ -0,0 +1,68 @@ |
||
1 |
var Nightmare = require('nightmare');

/*
 * Saves a book's index page as index.pdf, collects links from its table of
 * contents, and saves each linked page as "<n>- <title>.pdf" in the current
 * directory.
 *
 * Usage:
 *   SAFARI_EMAIL=... SAFARI_PASSWORD=... node <this script>
 *
 * Credentials are read from the environment so that no account secret is
 * stored in the source file.
 */

var BASE_URL = 'https://www.safaribooksonline.com';
var LOGIN_URL = BASE_URL + '/accounts/login/';
var LOGIN_FORM = 'form[action*="/accounts/login/"]';
var BOOK_URL = BASE_URL + '/library/view/learning-robotics-using/9781783287536/';

// Options passed to nightmare.pdf() for the index and for every page.
var options = {
  marginsType: 1,
  printBackground: false,
  printSelectionOnly: false,
  landscape: false,
  pageSize: "A4"
};

var email = process.env.SAFARI_EMAIL;
var password = process.env.SAFARI_PASSWORD;

/**
 * Makes a page title usable as a single file-name component.
 * Path separators would otherwise be interpreted as directories.
 *
 * @param {*} title - Title text read from the page (may be empty or null).
 * @returns {string} The title with "/" and "\" replaced by "-".
 */
function safeFileName(title) {
  return String(title == null ? '' : title).replace(/[\/\\]/g, '-');
}

/**
 * Runs inside the page (passed to nightmare.evaluate, so it must not
 * reference anything from this file). Collects the href of the first link
 * found in each nested table-of-contents list.
 *
 * @returns {Array} One entry per matched list; an entry is missing when the
 *   list contains no link with an href.
 */
function collectTocLinks() {
  var links = [];
  $(".detail-toc li ol").each(function () {
    var a_href = $(this).find('a').attr('href');
    links.push(a_href);
  });
  return links;
}

/**
 * Runs inside the page: strips the page down to the book content, injects
 * print CSS, and reads the title from what remains.
 *
 * @param {number} i - Page index, passed through unchanged.
 * @returns {Array} [i, title]
 */
function preparePage(i) {
  console.log("> Copying content to body")
  var $content = $('#sbo-rt-content').clone();
  $('body').html($content);
  console.log("> Changing print CSS")
  $('head').append('<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>');
  var title = $('h1.title').text();
  return [i, title];
}

/**
 * Validates the credentials, logs in, captures the index and every linked
 * page, and always shuts the browser down afterwards.
 */
function main() {
  if (!email || !password) {
    console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
    process.exitCode = 1;
    return;
  }

  // Created only after validation so no browser is started when there is
  // nothing to do.
  var nightmare = Nightmare({ show: true });

  /**
   * Captures links[i] and then recurses into the following entries.
   *
   * @param {Array} links - Relative links collected from the table of contents.
   * @param {number} i - Index of the entry to capture.
   * @returns {Promise} Settles once this and all following entries are saved.
   */
  var getPage = function (links, i) {
    if (i >= links.length) {
      return Promise.resolve();
    }
    if (links[i] == null) {
      // A list without a usable link would otherwise yield a bogus URL.
      console.log("Skipping page " + i + ": no link found.");
      return getPage(links, i + 1);
    }
    console.log("Capturing page " + i + ":");
    console.log(links[i]);
    return nightmare
      .viewport(1200, 800)
      .goto(BASE_URL + links[i])
      .wait(1000)
      .evaluate(preparePage, i)
      .then(function (result) {
        var title = result[1];
        console.log(title);
        // Per Nightmare's documentation, queued actions only run once the
        // chain is consumed with .then(); without it the last page's PDF
        // would never be written.
        return nightmare
          .pdf(i + "- " + safeFileName(title) + ".pdf", options)
          .wait(2000)
          .then(function () {
            return getPage(links, i + 1);
          });
      });
  };

  nightmare
    .viewport(1200, 800)
    .goto(LOGIN_URL)
    .type(LOGIN_FORM + ' [name=email]', email)
    .type(LOGIN_FORM + ' [name=password1]', password)
    .click(LOGIN_FORM + ' [type=submit]')
    .wait(1000)
    .goto(BOOK_URL)
    .wait(100)
    .pdf('index.pdf', options)
    .evaluate(collectTocLinks)
    .then(function (result) {
      console.log("Found " + result.length + " links.");
      // NOTE(review): capture starts at index 1, so result[0] is never
      // fetched. This is kept as-is; confirm whether it is intentional.
      return getPage(result, 1);
    })
    .catch(function (error) {
      console.error('Scraping failed:', error);
      process.exitCode = 1;
    })
    .then(function () {
      // .end() must be followed by .then() for the shutdown to execute.
      return nightmare.end().then(function () {});
    })
    .catch(function (error) {
      console.error('Failed to close the browser:', error);
      process.exitCode = 1;
    });
}

main();