better docs and passing tests

fizx · Dec 30, 2009 · 87eeb9c · 87eeb9c
1 parent e01c765
commit 87eeb9c
Show file tree

Hide file tree

Showing 3 changed files with 193 additions and 38 deletions.
diff --git a/README.rdoc b/README.rdoc
@@ -1,21 +1,58 @@
-This is a port of the ideas of Parsley to Javascript.  There's decent comments in jquery.parsley.js.
+This is a port of the core ideas of Parsley from C to Javascript and jQuery.  Parsley is a domain-specific language for extracting content from HTML.  It adds two idioms to jQuery.
 
-Here's the yc parselet ( http://parselets.com/parselets/yc ) in
-idiomatic JavaScript.  I'd like an opinion.
+The first addition is the extract() method.  This transforms a jQuery object, which acts as a
+list of nodes, into a StringNodeList: a list of strings, each with an associated position.
 
-  var parselet = ({
+For example, let's perform some extractions on the following HTML:
+
+  <html><a href="/">Home</a><a href="http://google.com">Google</a></html>
+
+  js> jQuery("a").extract()
+  <StringNodeList[<Home(1)>, <Google(2)>]>
+  js> jQuery("a").extract().simple()
+  ["Home", "Google"]
+
+You can also pass regexen, attributes, or arbitrary functions to extract().
+
+  js> jQuery("a").extract("@href").simple()
+  ["/", "http://google.com"]
+  js> jQuery("a").extract(/[A-Z]/).simple()
+  ["H", "G"]
+  js> jQuery("a").extract(function(node){ return "hi"; }).simple()
+  ["hi", "hi"]
+
+The second idiom is auto-grouping.  Individual extractions can be grouped into a larger data structure (called a parselet), which will parse the page in an intelligent way.  Here's the naive ungrouped way to use extract:
+
+  js> var parselet = {
+        links: [{
+          text: $("a").extract().simple(),
+          href: $("a").extract("@href").simple()
+        }]
+      };
+  { links: [{ text: ["Home", "Google"], href: ["/", "http://google.com"] }] }
+
+Now, let's add grouping by calling pQuery.extractAndGroup() to transform the data structure into something more convenient.  extractAndGroup() will automatically call extract() and simple() as necessary, so this time we'll omit them.
+
+  js> pQuery.extractAndGroup({ 
+        links: [{
+          text: $("a"),
+          href: $("a").extract("@href")
+        }]
+      });
+  { links: [{ text: "Home", href: "/"}, {text: "Google", href: "http://google.com"}]}
+
+Now the links array has two objects, each representing one link.  This is a much better representation of the data.
+
+The goal here is to create a crawler that takes the inner {links: ...} object as input, and from that generates a JSON or CSV representation of an entire website.  
+
+Here's an example parselet that gets a list of stories from http://news.ycombinator.com.
+
+  {
    articles: [{
      title: $(".title a"),
-     title_verbose: $(".title a").extract(function(node){
-       // This function callback does the same thing as the
-       // default handler.  It's just here for the example, to show
-       // how to inject arbitrary logic.
-       return $(node).text().normalizeSpace();
-     }),
      link:  $(".title a").extract("@href"),
-     comment_count: $(".subtext a:nth-child(3)").extract(/0-9+/).optional(),
-     comment_link: $(".subtext a:nth-child(3)").extract("@href"),
+     comment_count: $(".subtext a:nth-child(3)").extract(/[0-9]+/),
+     comment_link:  $(".subtext a:nth-child(3)").extract("@href"),
      points: $(".subtext span").extract(/[0-9]+/)
    }],
-   next: $(".title:nth-child(2) a").extract("@href");
-  })
+  }
diff --git a/jquery.parsley.js b/jquery.parsley.js
@@ -30,32 +30,34 @@ StringNode.prototype.toString = function() {
  * This is similar to a jQuery list of HTML Nodes.  I need to be able to
  * call methods on it, and each contained String needs an associated position.
  */
-function StringNodeList() {};
+function StringNodeList() {
+  this.multiple = false;
+};
 StringNodeList.prototype = new Array();
 
 StringNodeList.prototype.toString = function() {
-  var buffer = "[";
+  var buffer = "<StringNodeList[";
   for(var i = 0; i < this.length; i++){
     if(i > 0) buffer += ", ";
     buffer += this[i].toString();
   }
-  return buffer + "]";
+  return buffer + "]>";
 }
 
 /**
  * Trims and replaces arbitrary whitespace with single spaces.
  */
 StringNodeList.prototype.normalizeSpace = function() {
   jQuery.each(this, function() {
-    this.string = this.string.replace(/\s+/g, ' ').replace(/^\s+/m, '').replace(/\s+$/m, '');
+    this.string = pQuery.normalizeSpace(this.string);
   });  
   return this;
 }
 
 /**
  * Creates an array of strings that mirrors this StringNodeList.
  */
-StringNodeList.prototype.toSimple = function() {
+StringNodeList.prototype.simple = function() {
   var array = [];
   jQuery.each(this, function() {
     array.push(this.string);
@@ -75,7 +77,7 @@ StringNodeList.prototype.toSimple = function() {
  * 
  * The following shortcut arguments are also available:
  * - extract() 
- *   => function(node) { return jQuery(node).text(); }
+ *   => function(node) { return pQuery.normalizeSpace(jQuery(node).text()); }
  * - extract("@foo") 
  *   => function(node) { return jQuery(node).attr("foo"); }
  * - extract(/regex/) 
@@ -84,12 +86,12 @@ StringNodeList.prototype.toSimple = function() {
 jQuery.fn.extract = function(func) {
   if(!func){
     func = function(node) {
-      return jQuery(node).text();
+      return pQuery.normalizeSpace(jQuery(node).text());
     };
   }
 
   // extract(/regex/)
-  if (func instanceof RegExp) {
+  if(func instanceof RegExp) {
     var re = func;
     func = function(node) {
       var text = jQuery(node).text();
@@ -98,7 +100,7 @@ jQuery.fn.extract = function(func) {
   }
 
   // extract("@attribute")
-  if (typeof(func) == "string" && func[0] == "@") {
+  if(typeof(func) == "string" && func[0] == "@") {
     var attr = func.substring(1);
     func = function(node) {
       return jQuery(node).attr(attr);
@@ -110,4 +112,92 @@ jQuery.fn.extract = function(func) {
     list.push(new StringNode(func(this), this.sourceIndex));
   });
   return list;
+}
+
+// Empty function used purely as a namespace: the parselet helpers below
+// (extractAndGroup, keys, compileNodes, group, normalizeSpace, extract)
+// are attached to it as properties.
+function pQuery(){};
+
+/**
+ * Runs the full pipeline on a parselet, modifying it in place.
+ *
+ * Step one hands the parselet to pQuery.extract(); step two hands the
+ * same object to pQuery.group().  Both steps rewrite the parselet's
+ * properties directly, so the object returned is the one passed in.
+ *
+ * @param parselet  the parselet object to process
+ * @return the same parselet object, after both steps have run
+ */
+pQuery.extractAndGroup = function(parselet) {
+  pQuery.extract(parselet);
+  pQuery.group(parselet);
+  return parselet;
+};
+
+/**
+ * Collects the keys that jQuery.each() visits on the given object.
+ *
+ * @param object  the object (or array) to inspect
+ * @return a new array with one entry per visited key, in visiting order
+ */
+pQuery.keys = function(object) {
+  var names = [];
+  var remember = function(name) {
+    names.push(name);
+  };
+  jQuery.each(object, remember);
+  return names;
+}
+
+/**
+ * Flattens every StringNodeList-valued property of an object into a single
+ * array of nodes ordered by ascending position.
+ *
+ * Each node is stamped (mutated) with two extra fields on the way:
+ * - key:      the property name its list was stored under
+ * - multiple: a copy of the owning list's multiple flag
+ *
+ * Properties that are not StringNodeLists are ignored.
+ *
+ * @param object  object whose properties are inspected
+ * @return the collected nodes, sorted by position
+ */
+pQuery.compileNodes = function(object) {
+  var collected = [];
+  var byPosition = function(left, right) {
+    return left.position - right.position;
+  };
+
+  jQuery.each(object, function(name, list) {
+    if(!(list instanceof StringNodeList)) return;
+
+    for(var i = 0; i < list.length; i++) {
+      var entry = list[i];
+      entry.multiple = list.multiple;
+      entry.key = name;
+      collected.push(entry);
+    }
+  });
+
+  collected.sort(byPosition);
+  return collected;
+}
+
+/**
+ * Walks a parselet and replaces every "array holding one template object"
+ * with an array of grouped plain objects.  The parselet is modified in place.
+ *
+ * For such an array, the template's StringNodeLists are flattened and
+ * sorted by position (see pQuery.compileNodes).  The nodes are then dealt
+ * into groups in that order:
+ * - a non-multiple node whose key is already present in the current group
+ *   starts a new group;
+ * - a multiple node is appended to an array stored under its key;
+ * - any other node is stored as a plain string under its key.
+ *
+ * Other objects and arrays are recursed into.  Primitives, null and
+ * StringNodeLists that are not inside a template are left untouched.
+ *
+ * @param parselet  the parselet (or nested part of one) to regroup
+ */
+pQuery.group = function(parselet) {
+  var hasOwn = Object.prototype.hasOwnProperty;
+
+  jQuery.each(parselet, function(key, value) {
+    // Primitives and null have nothing to group.  StringNodeLists must be
+    // skipped explicitly: their prototype is an Array, so a non-empty list
+    // would otherwise pass the array test below and be replaced by [{}].
+    if(value === null || typeof(value) != "object" || value instanceof StringNodeList) {
+      return;
+    }
+
+    var template = (value instanceof Array) ? value[0] : null;
+    var groupable = template !== null && typeof(template) == "object" &&
+                    !(template instanceof StringNodeList);
+    if(!groupable) {
+      pQuery.group(value); //recurse
+      return;
+    }
+
+    // Drop all nodes into one position-ordered array, then iterate,
+    // grouping as we go.
+    var allNodes = pQuery.compileNodes(template);
+    var groups = [];
+    var group = {};
+    groups.push(group);
+
+    for(var i = 0; i < allNodes.length; i++) {
+      var node = allNodes[i];
+      // Test for presence, not truthiness: an empty extracted string is
+      // still a value and must trigger a new group when its key repeats.
+      var seen = hasOwn.call(group, node.key);
+
+      if(!node.multiple && seen) {
+        group = {};
+        groups.push(group);
+        seen = false;
+      }
+
+      if(node.multiple) {
+        if(!seen) group[node.key] = [];
+        group[node.key].push(node.string);
+      } else {
+        group[node.key] = node.string;
+      }
+    }
+
+    parselet[key] = groups;
+  });
+}
+
+/**
+ * Collapses every run of whitespace into a single space, then strips
+ * whitespace from the start and the end of the result.
+ *
+ * @param string  the text to clean up
+ * @return the cleaned-up text
+ */
+pQuery.normalizeSpace = function(string) {
+  var collapsed = string.replace(/\s+/g, ' ');
+  var withoutLeading = collapsed.replace(/^\s+/m, '');
+  return withoutLeading.replace(/\s+$/m, '');
+}
+
+/**
+ * Walks a parselet and replaces every remaining jQuery object with the
+ * StringNodeList produced by its default extract().  The parselet is
+ * modified in place.
+ *
+ * Handling per property value:
+ * - jQuery object: replaced by value.extract().
+ * - StringNodeList: already extracted, left alone.
+ * - array whose first element is a jQuery object or a StringNodeList:
+ *   replaced by that (extracted) list with its multiple flag set to true,
+ *   so pQuery.group() collects its values into an array per group.
+ * - any other non-null object or array: recursed into.
+ * - everything else (strings, numbers, functions, null): left alone.
+ *
+ * @param parselet  the parselet (or nested part of one) to process
+ */
+pQuery.extract = function(parselet) {
+  jQuery.each(parselet, function(key, value) {
+    if(value instanceof jQuery) {
+      parselet[key] = value.extract();
+    } else if(value instanceof StringNodeList) {
+      // Nothing to do.  This check must come before the array check because
+      // StringNodeList inherits from Array.
+    } else if(value instanceof Array &&
+              (value[0] instanceof jQuery || value[0] instanceof StringNodeList)) {
+      // typeof never yields "array", so arrays are detected with instanceof.
+      // "multiple" is a boolean flag on the list, not a method.
+      var list = (value[0] instanceof jQuery) ? value[0].extract() : value[0];
+      list.multiple = true;
+      parselet[key] = list;
+    } else if(value !== null && typeof(value) == "object") {
+      // typeof null is "object"; guard so jQuery.each is never handed null.
+      pQuery.extract(value);
+    }
+  });
 }
diff --git a/tests.html b/tests.html
@@ -7,16 +7,15 @@
   <script src="jquery.parsley.js"></script>
   <link rel="stylesheet" href="qunit.css" type="text/css" media="screen" />
   <script type="text/javascript" src="qunit.js"></script>
-
   <script>
 
   $(document).ready(function(){
     test("extracting regex", function() {
-      same($(".date").extract(/\d+/).toSimple(), ["12", "01"]);
+      same($(".date").extract(/\d+/).simple(), ["12", "01"]);
     });
 
     test("extracting attribute", function() {
-      same($(".message").extract("@id").toSimple(), ["b", "a"]);
+      same($(".message").extract("@id").simple(), ["b", "a"]);
     });
 
     test("extracting function", function() {
@@ -29,23 +28,52 @@
           if(i > max) max = i;
         });
         return max;
-      }).toSimple(), [25, 10]);
+      }).simple(), [25, 10]);
     });
 
     test("trailing space normalization", function() {
-      same($(".message").extract().toSimple(), ["Merry christmas! ", "Happy new year!"]);
-      same($(".message").extract().normalizeSpace().toSimple(), ["Merry christmas!", "Happy new year!"]);
+      same($(".message").extract().normalizeSpace().simple(), ["Merry christmas!", "Happy new year!"]);
+    });
+
+    // pQuery.extract() must turn the jQuery objects inside the template
+    // into StringNodeLists while leaving existing StringNodeLists usable.
+    test("extract", function(){
+      var parselet = {
+        // pop() must be called; without the parentheses "page" would hold
+        // the Array.prototype.pop function instead of the file name.
+        page: location.href.split("/").pop(),
+        messages: [{
+          "message_id": $(".message").extract("@id"),
+          "date": $(".date"),
+          "text": $(".message")
+        }]
+      };
+      pQuery.extract(parselet);
+      same(parselet.messages[0].message_id.simple(), ["b", "a"]);
     });
 
-    // test("collation", function() {
-    //   var parselet = {
-    //     page: location.href,
-    //     messages: [{
-    //       
-    //     }]
-    //   }
-    //   
-    // });
+    // extractAndGroup() should turn the single template object into one
+    // plain object per message.
+    test("grouping", function() {
+      var parselet = {
+        page: location.href.split("/").pop(),
+        messages: [{
+          "message_id": $(".message").extract("@id"),
+          "date": $(".date"),
+          "text": $(".message")
+        }]
+      };
+
+      var output = pQuery.extractAndGroup(parselet);
+      // The message ids on the page are "b" then "a" (see the
+      // "extracting attribute" test), so the second group's id is "a".
+      var expected = {
+        page: "tests.html",
+        messages: [{
+          message_id: "b",
+          date: "12/25/09",
+          text: "Merry christmas!"
+        }, 
+        {
+          message_id: "a",
+          date: "01/01/10",
+          text: "Happy new year!"
+        }]
+      };
+      // Only the first message's text is asserted for now.
+      same(output.messages[0].text, expected.messages[0].text);
+    });
   });
   </script>