notes, cleanup

This commit is contained in:
Adam Pash 2016-09-06 09:55:36 -04:00
parent 752331eaae
commit 11a2286659
4 changed files with 89 additions and 5 deletions

NOTES.md (new file, 84 lines)

@ -0,0 +1,84 @@
Each extractor should ultimately be an object that exports like so:
```javascript
import GenericContentExtractor from './content/extractor'
import GenericTitleExtractor from './title/extractor'
import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
const GenericExtractor = {
content: GenericContentExtractor,
title: GenericTitleExtractor,
author: GenericAuthorExtractor,
datePublished: GenericDatePublishedExtractor,
dek: GenericDekExtractor,
leadImageUrl: GenericLeadImageUrlExtractor,
}
```
Custom parsers can then be merged with the generic parser to fill in gaps in their implementations, e.g.:
```javascript
import NYMagContentExtractor from '...'
import NYMagTitleExtractor from '...'
const NYMagExtractor = {
content: NYMagContentExtractor,
title: NYMagTitleExtractor,
}
const Extractor = {
...GenericExtractor,
...NYMagExtractor
}
```
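A standalone sketch of that fallback behavior, with stub functions standing in for the real extractor modules (the bodies here are placeholders, not the actual implementations):

```javascript
// Stub extractors standing in for the imported modules above.
const GenericExtractor = {
  content: () => 'generic content',
  title: () => 'generic title',
  dek: () => 'generic dek',
}

const NYMagExtractor = {
  content: () => 'nymag content',
  title: () => 'nymag title',
}

// Later spreads win: NYMag overrides content and title,
// while dek falls through to the generic implementation.
const Extractor = {
  ...GenericExtractor,
  ...NYMagExtractor,
}

Extractor.content() // → 'nymag content'
Extractor.dek()     // → 'generic dek'
```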
# Declarative Custom Extractors
My goal is to create declarative extractors that describe *what* to extract rather than *how* to extract it. So, for example:
```javascript
const NYMagExtractor = {
content: {
// Order by most likely. Extractor will stop on first occurrence
selectors: [
'div.article-content',
'section.body',
'article.article',
],
// Selectors to remove from the extracted content
clean: [
'.ad',
],
// Array of transformations to make on matched elements
// Each item in the array is an object. The key is the
// selector, the value is a transformation function
// for the matching node.
transforms: [
// Convert h1s to h2s
{
'h1': ($node) => convertNodeTo($node, $, 'h2')
},
// Convert lazy-loaded noscript images to figures
{
'noscript': ($node) => {
const $children = $node.children()
if ($children.length === 1 && $children.get(0).tagName === 'img') {
convertNodeTo($node, $, 'figure')
}
}
}
]
},
title: [
'h1',
]
}
```
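The "stop on first occurrence" selector logic above can be sketched in isolation. Instead of a real DOM, `doc` here maps selectors to extracted text; `selectFirst` and the whole setup are hypothetical scaffolding, not the parser's actual API:

```javascript
// Try selectors in order and return the first match, mirroring
// the declarative `selectors` array in the config above.
function selectFirst(doc, selectors) {
  for (const selector of selectors) {
    if (doc[selector] !== undefined) return doc[selector]
  }
  return null
}

// Stand-in "document": selector → extracted text.
const doc = {
  'section.body': 'Article body text',
  'h1': 'Headline',
}

const content = selectFirst(doc, [
  'div.article-content', // not present; skipped
  'section.body',        // first match wins
  'article.article',     // never reached
])
// content === 'Article body text'
```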


@ -1,6 +1,7 @@
TODO:
Tmrw:
- extractNextPageUrl
- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
- Try Closure webpack compiler
- Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly
@ -10,7 +11,6 @@ TODO:
- Test if .is method is faster than regex methods
- Separate constants into activity-specific folders (dom, scoring)
DONE:
x extractLeadImageUrl
x extractDek
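The lead-image-url concern above can be illustrated with a regex that accepts an image extension even when a query string follows (a hypothetical pattern for illustration, not the parser's actual one):

```javascript
// Hypothetical check: match an image extension followed by an
// optional query string, instead of requiring end-of-string.
const IMG_EXT_RE = /\.(jpe?g|png|gif|webp)(\?.*)?$/i

IMG_EXT_RE.test('https://example.com/foo.jpg')            // → true
IMG_EXT_RE.test('https://example.com/foo.jpg?otherstuff') // → true
IMG_EXT_RE.test('https://example.com/foo.html')           // → false
```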


@ -20,7 +20,7 @@ const GenericExtractor = {
// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
-const metaCache = $('meta').map((index, node) => {
+const metaCache = $('meta').map((_, node) => {
return $(node).attr('name')
}).toArray()


@ -24,7 +24,7 @@ describe('GenericExtractor', () => {
'California appears poised to be first to ban power-guzzling big-screen TVs'
)
assert.equal(
-datePublished.toISOString(),
+datePublished,
'2009-10-14T04:00:00.000Z'
)
assert.equal(dek, null)