import xray from 'x-ray';
// x-ray scraper instance used by the scraping snippets below.
let x = xray();
const targetUrl = 'https://en.wikipedia.org/wiki/Category:Marvel_Comics_superheroes';
const selector = '#mw-pages .mw-category-group li a@href';
// The selector is wrapped in an array so that x-ray returns every match;
// a bare string selector only yields the first matching element.
x(targetUrl, [selector])((err, urlList) => {
  // x-ray callbacks are Node-style: the error is always the first argument.
  if (err) {
    console.error(err);
    return;
  }
  // urlList is an array of all `href` values
});
Raw dump
Custom markup
No way!
{{Other uses}} {{About|Steve Rogers|the subsequent versions of the character|List of incarnations of Captain America}} {{pp-vandalism|expiry=03:36, 23 May 2016|small=yes}} {{Infobox comics character | image = CaptainAmerica109.jpg | converted = y | caption = ''Captain America'' #109 (Jan. 1969).Cover art by [[Jack Kirby]] and [[Syd Shores]]. | alt = Captain America bursting […]
Unofficial project
Community effort
structured data
Frozen in time (2015/08)
{
"abstract": [
{
"lang": "en",
"value": "Thor is a fictional character, a superhero […]"
}
],
"aliases": [
{
"lang": "en",
"value": "Dr. Donald Blake, Jake Olson, Sigurd Jarlson, Eric Masterson"
}
]
}
import infobox from 'wiki-infobox';
// Receives the parsed infobox; the sample below is an excerpt of `data`
// for the Hulk page (`err` is not inspected in this demo).
const handleHulkInfobox = (err, data) => {
  // {
  //   "character_name": {
  //     "type": "text",
  //     "value": "The Incredible Hulk"
  //   },
  //   "aliases": [
  //     {
  //       "type": "text",
  //       "value": "<br>Green Scar<br>World-Breaker<br>Jade Giant"
  //     },
  //     […]
  //   ],
  //   […]
  // }
};

// Second argument is presumably the Wikipedia language code.
infobox('Hulk_(comics)', 'en', handleHulkInfobox);
Only metadata
Just for aliases
npm module wikidata-sdk
{
"pageid": 81585,
"aliases": [
{
"value": "Peter Parker"
},
{
"value": "Webhead"
},
{
"value": "Spidey"
}
],
[…]
}
Pageviews count
Personal project
Dead after 2016/01
Biased by Netflix
{
"title": "Iron_Man",
"rank": 2111,
"daily_views": {
"2016-01-20": 2476,
"2016-01-19": 2394,
"2016-01-18": 2359,
"2016-01-17": 2196,
"2016-01-16": 2563,
"2016-01-15": 2661,
"2016-01-14": 2393,
[…]
}
}
No image in any API
Manual scraping
x-ray
again
// Scrape the `src` attribute of the image found inside the page's infobox.
x(
  'https://wikipedia.org/Wolverine',
  '.infobox a.image img@src'
)((err, imageUrl) => {
  // x-ray callbacks are Node-style: with a single parameter the callback
  // would receive the error slot, not the scraped value.
  if (err) {
    console.error(err);
    return;
  }
  console.log(imageUrl);
});
2 years old
Unreliable:
retryUntilItWorks()
Abandoned by his mother, Matt Murdock was raised by his father, boxer "Battling Jack" Murdock […]
827 comics, 1326 stories
Different from the API
Awesome for design
Manual scraping
Various npm run
scripts
One per source
Run in isolation
Temporary save on disk
Scraping is easy and slow
Parsing is hard and fast
$ npm run dbpedia
./download/dbpedia
├── 8-Ball_(comics).json
├── Abdul_Alhazred_(comics).json
├── Abigail_Brand.json
[…]
├── Zombie_(comics).json
├── Zom.json
└── Zzzax.json
Merge all sources
Define fallbacks
Committed in git
Ordered keys
$ npm run consolidate
./download
├── dbpedia
├── images
├── infobox
├── marvel
│ ├── api
│ └── website
├── pageviews
├── urls
└── wikidata
./records
├── 8-Ball_(comics).json
[…]
└── Zzzax.json
Various patterns
Callbacks and promises
Make it chainable
// Pattern 1 — Node-style: the (err, data) callback is passed as the last argument.
f(input, (err, data) => {});
// Pattern 2 — curried: f(input) returns a function that is then called with the (err, data) callback.
f(input)((err, data) => {});
// Promise pipeline: each step runs once the previous one has fulfilled and
// receives its result.
HelperPath.createDir(infoboxDir)
  .then(getUrls)
  .then(getInfoboxes)
  .then(saveToDisk)
  .then(teardown)
  .catch((err) => {
    // Terminate the chain explicitly so a failing step is reported and the
    // script exits with a non-zero status instead of leaving a floating rejection.
    console.error(err);
    process.exitCode = 1;
  });
import Promise from 'bluebird';
// infobox(url, (err, data) => {});
/**
 * Promise-returning variant of `infobox`.
 *
 * Wraps the callback-based `infobox` function with `Promise.promisify` and
 * calls the wrapper with `url` as its only argument.
 *
 * NOTE(review): elsewhere `infobox` is called with a language argument
 * before the callback — confirm the single-argument form is supported.
 *
 * @param {string} url - Value forwarded to `infobox` as its first argument.
 * @returns {Promise<*>} Settles with whatever `infobox` passes to its callback.
 */
function infoboxAsPromise(url) {
  const promisifiedInfobox = Promise.promisify(infobox);
  return promisifiedInfobox(url);
}
// x(url, context, selectors)((err, data) => { });
/**
 * Promise-returning wrapper around an x-ray scrape.
 *
 * Calls `x(url, context, selector)` and passes a Node-style callback to the
 * function it returns; the promise rejects with the callback's error, or
 * fulfills with its data.
 *
 * @param {*} url - Forwarded unchanged as the first argument of `x`.
 * @param {*} context - Forwarded unchanged as the second argument of `x`.
 * @param {*} selector - Forwarded unchanged as the third argument of `x`.
 * @returns {Promise<*>} Fulfills with the scraped data, rejects with the scrape error.
 */
function xrayAsPromise(url, context, selector) {
  // The Promise constructor replaces the deprecated deferred object and also
  // turns a synchronous throw from `x` into a rejection.
  return new Promise((resolve, reject) => {
    x(url, context, selector)((err, data) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(data);
    });
  });
}
malformed data
+ untrusted data
----------------
= unit testing!
$ npm run test
HelperDBPedia
isHero
✓ should be true if hero
getPowers
✓ should split on new lines
✓ should split on commas
✓ should work on arrays
✓ should remove comments
[…]
319 passing (598ms)
https://community.algolia.com/instantsearch.js/
library of UI widgets
fully customizable
eat your own dog food
10k records free
<h3>Teams</h3>
<div id="teams"></div>
[…].refinementList({
container: '#teams',
attributeName: 'teams',
operator: 'and',
limit: 10
})
.ais-refinement-list--label {
cursor: pointer;
font-weight: normal;
}
Image CDN
Resize and compress
Tons of effects
7.5k operations free
Free data
+ Free software
+ Free hosting
+ Free search
-------------
= Awesome
2 months free
https://www.algolia.com/redeem/PARISJS55