Safari Bookstore scraper

James Peret 8 years ago
commit
be348cecc9
5 changed files with 176 additions and 0 deletions
  1. 4 0
      .gitignore
  2. 72 0
      index.js
  3. 14 0
      package.json
  4. 18 0
      test.js
  5. 68 0
      test2.js

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
1
+book/
2
+.DS_Store
3
+node_modules/
4
+npm-debug.log

+ 72 - 0
index.js

@@ -0,0 +1,72 @@
/**
 * Safari Books Online scraper.
 *
 * Logs in, opens the page given as the starting point, saves the content of
 * each page as a PDF under `book/`, and follows the page's "next" link until
 * there is none.
 *
 * Usage:
 *   SAFARI_EMAIL=... SAFARI_PASSWORD=... node index.js <start_link> [start_page] [wait_time]
 *
 *   start_link  URL of the first page to capture (required).
 *   start_page  Number used in the file name of the first PDF (default 1).
 *   wait_time   Milliseconds to wait before printing each page (default 2000).
 *
 * The account credentials are read from the environment so that they are
 * never stored in the repository.
 */
var fs = require('fs');
var Nightmare = require('nightmare');

var BASE_URL = 'https://www.safaribooksonline.com';
var LOGIN_URL = BASE_URL + '/accounts/login/';
var LOGIN_FORM = 'form[action*="/accounts/login/"]';
var OUTPUT_DIR = 'book';

// Print stylesheet injected into every captured page.
var PRINT_CSS = '<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>';

// Options handed to nightmare's .pdf() for every page.
var pdf_options = {
  marginsType: 1,
  printBackground: false,
  printSelectionOnly: false,
  landscape: false,
  pageSize: "A4"
};

var start_link = process.argv[2];
// A missing, non-numeric or zero argument falls back to the default.
var start_page = parseInt(process.argv[3], 10) || 1;
var wait_time = parseInt(process.argv[4], 10) || 2000;

/**
 * Turns a page title into something safe to use inside a file name.
 *
 * @param {string} title - Title text read from the page.
 * @returns {string} The title with path separators and other characters that
 *   are invalid in file names replaced by "_", and whitespace collapsed.
 */
function safeFileName(title) {
  return String(title)
    .replace(/[\/\\:*?"<>|]/g, '_')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Runs inside the browser page (it is serialised by nightmare, so it cannot
 * use variables from this file; the stylesheet is passed in as an argument).
 * Reads the "next" link and the title, then replaces the body with a copy of
 * the book content and appends the print stylesheet.
 * Relies on the `$` (jQuery) global provided by the page itself.
 *
 * @param {string} print_css - Markup of the <style> element to append.
 * @returns {{title: string, link: (string|undefined)}} Page title and the
 *   href of the "next" link, if the page has one.
 */
function extractPage(print_css) {
  // Both values must be read before the body is replaced below.
  var link = $(".next.nav-link").attr('href');
  var title = $('h1.title').text();
  var $content = $('#sbo-rt-content').clone();
  $('body').html($content);
  $('head').append(print_css);
  return { title: title, link: link };
}

/**
 * Captures one page as a PDF and then recurses into the next page.
 *
 * @param {Object} nightmare - The logged-in nightmare instance.
 * @param {number} i - Number used in the PDF file name.
 * @param {string} link - Absolute URL of the page to capture.
 * @returns {Promise} Settles once this page and all following pages have been
 *   written and the browser has been shut down.
 */
function capturePage(nightmare, i, link) {
  console.log("Capturing page " + i + ":");
  return nightmare
    .viewport(1200, 800)
    .goto(link)
    .wait(1000)
    .evaluate(extractPage, PRINT_CSS)
    .then(function (page) {
      console.log("> title: " + page.title);
      console.log("> next: " + page.link);
      // The chain is returned and ended with .then() so that the PDF of the
      // last page is actually written: queued nightmare actions only run
      // once .then() is called.
      return nightmare
        .wait(wait_time)
        .pdf(OUTPUT_DIR + "/" + i + "- " + safeFileName(page.title) + ".pdf", pdf_options)
        .then(function () {
          if (page.link) {
            return capturePage(nightmare, i + 1, BASE_URL + page.link);
          }
          // .end() only queues the shutdown; .then() is what runs it.
          return nightmare.end().then(function () {
            console.log("Done.");
          });
        });
    });
}

/**
 * Validates the invocation, logs in and starts the capture.
 */
function main() {
  if (!start_link) {
    console.log("Please provide a link for the scraping starting point.");
    return;
  }

  var email = process.env.SAFARI_EMAIL;
  var password = process.env.SAFARI_PASSWORD;
  if (!email || !password) {
    console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
    process.exitCode = 1;
    return;
  }

  // The output directory is git-ignored, so it does not exist in a fresh clone.
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR);
  }

  // Created only after validation so that nothing is launched for an
  // invalid invocation.
  var nightmare = Nightmare({ show: true });

  nightmare
    .viewport(1200, 800)
    .goto(LOGIN_URL)
    .type(LOGIN_FORM + ' [name=email]', email)
    .type(LOGIN_FORM + ' [name=password1]', password)
    .click(LOGIN_FORM + ' [type=submit]')
    .wait(1000)
    .then(function () {
      return capturePage(nightmare, start_page, start_link);
    })
    .catch(function (error) {
      console.error('Scraping failed:', error);
      process.exitCode = 1;
      // Shut the browser down so the process can exit after a failure.
      return nightmare.end().then(function () {}, function (end_error) {
        console.error('Could not close the browser:', end_error);
      });
    });
}

main();

+ 14 - 0
package.json

@@ -0,0 +1,14 @@
1
+{
2
+  "name": "betabot-scraper",
3
+  "version": "0.0.1",
4
+  "description": "Web scraping scripts for the Betabot project",
5
+  "main": "index.js",
6
+  "scripts": {
7
+    "test": "echo \"Error: no test specified yet\" && exit 1"
8
+  },
9
+  "author": "James Peret",
10
+  "license": "ISC",
11
+  "dependencies": {
12
+    "nightmare": "^2.4.1"
13
+  }
14
+}

+ 18 - 0
test.js

@@ -0,0 +1,18 @@
/**
 * Smoke test for nightmare: searches Yahoo for "github nightmare" and prints
 * the href of the first result link.
 */
var Nightmare = require('nightmare');

var SEARCH_FORM = 'form[action*="/search"]';

/**
 * Runs inside the browser page.
 * @returns {string} href of the first link in the search results list.
 */
function firstResultHref() {
  var anchor = document.querySelector('#main .searchCenterMiddle li a');
  return anchor.href;
}

/** Prints the value produced by the search. */
function printResult(href) {
  console.log(href);
}

/** Reports any failure of the search chain. */
function reportFailure(err) {
  console.error('Search failed:', err);
}

var browser = Nightmare({ show: true });

// Queue the whole search, including the shutdown; nothing runs until .then().
var search = browser
  .goto('http://yahoo.com')
  .type(SEARCH_FORM + ' [name=p]', 'github nightmare')
  .click(SEARCH_FORM + ' [type=submit]')
  .wait('#main')
  .evaluate(firstResultHref)
  .end();

search
  .then(printResult)
  .catch(reportFailure);

+ 68 - 0
test2.js

@@ -0,0 +1,68 @@
/**
 * Table-of-contents based capture experiment.
 *
 * Logs in, saves the book's landing page as `index.pdf`, collects links from
 * the table of contents and saves each linked page as "<index>- <title>.pdf"
 * in the current directory.
 *
 * Usage:
 *   SAFARI_EMAIL=... SAFARI_PASSWORD=... node test2.js
 *
 * The account credentials are read from the environment so that they are
 * never stored in the repository.
 */
var Nightmare = require('nightmare');

var BASE_URL = 'https://www.safaribooksonline.com';
var LOGIN_URL = BASE_URL + '/accounts/login/';
var LOGIN_FORM = 'form[action*="/accounts/login/"]';
var BOOK_URL = BASE_URL + '/library/view/learning-robotics-using/9781783287536/';

// Print stylesheet injected into every captured page.
var PRINT_CSS = '<style type="text/css" media="print"><!-- @page { size: auto; margin: 25mm;} @media print { body {margin: 25mm;} } body { margin: 0px; padding: 0px;} #sbo-rt-content { max-width: 90%;} --></style>';

// Options handed to nightmare's .pdf() for every document.
var pdf_options = {
  marginsType: 1,
  printBackground: false,
  printSelectionOnly: false,
  landscape: false,
  pageSize: "A4"
};

/**
 * Turns a page title into something safe to use inside a file name.
 *
 * @param {string} title - Title text read from the page.
 * @returns {string} The title with path separators and other characters that
 *   are invalid in file names replaced by "_", and whitespace collapsed.
 */
function safeFileName(title) {
  return String(title)
    .replace(/[\/\\:*?"<>|]/g, '_')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Runs inside the browser page. Relies on the `$` (jQuery) global provided
 * by the page itself.
 *
 * @returns {Array} For every `.detail-toc li ol` element, the href of the
 *   first anchor found inside it (may be missing for some entries).
 */
function collectTocLinks() {
  var links = [];
  $(".detail-toc li ol").each(function () {
    var a_href = $(this).find('a').attr('href');
    links.push(a_href);
  });
  return links;
}

/**
 * Runs inside the browser page (it is serialised by nightmare, so the
 * stylesheet is passed in as an argument). Replaces the body with a copy of
 * the book content, appends the print stylesheet and reads the title.
 *
 * @param {string} print_css - Markup of the <style> element to append.
 * @returns {string} Text of `h1.title` after the body has been replaced.
 */
function isolateContent(print_css) {
  console.log("> Copying content to body")
  var $content = $('#sbo-rt-content').clone();
  $('body').html($content);
  console.log("> Changing print CSS")
  $('head').append(print_css);
  return $('h1.title').text();
}

/**
 * Captures links[i] as a PDF and then recurses into the following entry.
 *
 * @param {Object} nightmare - The logged-in nightmare instance.
 * @param {Array} links - Hrefs collected from the table of contents.
 * @param {number} i - Index of the entry to capture.
 * @returns {Promise} Settles once every remaining entry has been written and
 *   the browser has been shut down.
 */
function captureFrom(nightmare, links, i) {
  if (i >= links.length) {
    // .end() only queues the shutdown; .then() is what runs it.
    return nightmare.end().then(function () {
      console.log("Done.");
    });
  }
  if (!links[i]) {
    // An entry without an href would otherwise produce a URL ending in "null".
    console.log("Skipping page " + i + ": no link found.");
    return captureFrom(nightmare, links, i + 1);
  }
  console.log("Capturing page " + i + ":");
  console.log(links[i]);
  return nightmare
    .viewport(1200, 800)
    .goto(BASE_URL + links[i])
    .wait(1000)
    .evaluate(isolateContent, PRINT_CSS)
    .then(function (title) {
      console.log(title);
      // The chain is returned and ended with .then() so that the PDF of the
      // last entry is actually written: queued nightmare actions only run
      // once .then() is called.
      return nightmare
        .pdf(i + "- " + safeFileName(title) + ".pdf", pdf_options)
        .wait(2000)
        .then(function () {
          return captureFrom(nightmare, links, i + 1);
        });
    });
}

/**
 * Validates the environment, logs in and starts the capture.
 */
function main() {
  var email = process.env.SAFARI_EMAIL;
  var password = process.env.SAFARI_PASSWORD;
  if (!email || !password) {
    console.error("Please set the SAFARI_EMAIL and SAFARI_PASSWORD environment variables.");
    process.exitCode = 1;
    return;
  }

  var nightmare = Nightmare({ show: true });

  nightmare
    .viewport(1200, 800)
    .goto(LOGIN_URL)
    .type(LOGIN_FORM + ' [name=email]', email)
    .type(LOGIN_FORM + ' [name=password1]', password)
    .click(LOGIN_FORM + ' [type=submit]')
    .wait(1000)
    .goto(BOOK_URL)
    .wait(100)
    .pdf('index.pdf', pdf_options)
    .evaluate(collectTocLinks)
    .then(function (links) {
      console.log("Found " + links.length + " links.");
      // NOTE(review): capture starts at index 1, so links[0] is never
      // captured. Kept as in the original script; confirm whether the first
      // entry is skipped on purpose.
      return captureFrom(nightmare, links, 1);
    })
    .catch(function (error) {
      console.error('Scraping failed:', error);
      process.exitCode = 1;
      // Shut the browser down so the process can exit after a failure.
      return nightmare.end().then(function () {}, function (end_error) {
        console.error('Could not close the browser:', end_error);
      });
    });
}

main();