mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
notes, cleanup
This commit is contained in:
parent
752331eaae
commit
11a2286659
84
NOTES.md
Normal file
84
NOTES.md
Normal file
@ -0,0 +1,84 @@
|
||||
Each extractor should ultimately be an object that exports like so:
|
||||
|
||||
```javascript
|
||||
import GenericContentExtractor from './content/extractor'
|
||||
import GenericTitleExtractor from './title/extractor'
|
||||
import GenericAuthorExtractor from './author/extractor'
|
||||
import GenericDatePublishedExtractor from './date-published/extractor'
|
||||
import GenericDekExtractor from './dek/extractor'
|
||||
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
|
||||
|
||||
const GenericExtractor = {
|
||||
content: GenericContentExtractor,
|
||||
title: GenericTitleExtractor,
|
||||
author: GenericAuthorExtractor,
|
||||
datePublished: GenericDatePublishedExtractor,
|
||||
dek: GenericDekExtractor,
|
||||
leadImageUrl: GenericLeadImageUrlExtractor,
|
||||
}
|
||||
```
|
||||
|
||||
Custom parsers can then be merged with the generic parser to fill in gaps in their implementations. E.g.:
|
||||
|
||||
```javascript
|
||||
import NYMagContentExtractor from '...'
|
||||
import NYMagTitleExtractor from '...'
|
||||
|
||||
const NYMagExtractor = {
|
||||
content: NYMagContentExtractor,
|
||||
title: NYMagTitleExtractor,
|
||||
}
|
||||
|
||||
const Extractor = {
|
||||
...GenericExtractor,
|
||||
...NYMagExtractor
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
# Declarative Custom Extractors
|
||||
|
||||
My goal is to create declarative extractors that describe what rather than how. So, for example:
|
||||
|
||||
```javascript
|
||||
NYMagExtractor = {
|
||||
content: {
|
||||
// Order by most likely. Extractor will stop on first occurrence
|
||||
selectors: [
|
||||
'div.article-content',
|
||||
'section.body',
|
||||
'article.article',
|
||||
],
|
||||
|
||||
// Selectors to remove from the extracted content
|
||||
clean: [
|
||||
'.ad',
|
||||
],
|
||||
|
||||
// Array of transformations to make on matched elements
|
||||
// Each item in the array is an object. The key is the
|
||||
// selector, the value is a transformation function
|
||||
// for the matching node.
|
||||
transforms: [
|
||||
// Convert h1s to h2s
|
||||
{
|
||||
'h1': ($node) => convertNodeTo($node, $, 'h2')
|
||||
},
|
||||
|
||||
// Convert lazy-loaded noscript images to figures
|
||||
{
|
||||
'noscript': ($node) => {
|
||||
const $children = $node.children()
|
||||
if ($children.length === 1 && $children.get(0).tagName === 'img') {
|
||||
convertNodeTo($node, $, 'figure')
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
title: [
|
||||
'h1',
|
||||
]
|
||||
}
|
||||
```
|
4
TODO.md
4
TODO.md
@ -1,5 +1,6 @@
|
||||
TODO:
|
||||
Tmrw:
|
||||
- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
|
||||
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
|
||||
- extractNextPageUrl
|
||||
- Try Closure webpack compiler
|
||||
- Rename all cleaners from cleanThing to clean
|
||||
@ -10,7 +11,6 @@ TODO:
|
||||
- Test if .is method is faster than regex methods
|
||||
- Separate constants into activity-specific folders (dom, scoring)
|
||||
|
||||
|
||||
DONE:
|
||||
x extractLeadImageUrl
|
||||
x extractDek
|
||||
|
@ -20,7 +20,7 @@ const GenericExtractor = {
|
||||
|
||||
// Cached value of every meta name in our document.
|
||||
// Used when extracting title/author/date_published/dek
|
||||
const metaCache = $('meta').map((index, node) => {
|
||||
const metaCache = $('meta').map((_, node) => {
|
||||
return $(node).attr('name')
|
||||
}).toArray()
|
||||
|
||||
|
@ -24,7 +24,7 @@ describe('GenericExtractor', () => {
|
||||
'California appears poised to be first to ban power-guzzling big-screen TVs'
|
||||
)
|
||||
assert.equal(
|
||||
datePublished.toISOString(),
|
||||
datePublished,
|
||||
'2009-10-14T04:00:00.000Z'
|
||||
)
|
||||
assert.equal(dek, null)
|
||||
|
Loading…
Reference in New Issue
Block a user