Initial commit of working RSS Aggregator build

This commit is contained in:
2026-05-12 17:04:02 -03:00
parent ea3a2ca53e
commit 7ac2f6e384
4962 changed files with 1032666 additions and 0 deletions
+73
View File
@@ -0,0 +1,73 @@
/**
 * Field tables describing which XML elements are copied onto the parsed
 * output. Each entry is either:
 *   - a string: the element is copied to a property of the same name, or
 *   - an array `[from, to]` (optionally `[from, to, options]`): the element
 *     `from` is copied to the property `to`.
 * The tables are consumed by `utils.copyFromXML`.
 */
const fields = module.exports = {};

// Channel-level fields for RSS feeds. Entries are processed in order, so a
// later entry writing to the same destination overrides an earlier one
// (e.g. `dc:creator` wins over `author` for `creator`).
fields.feed = [
  // Renamed on copy.
  ['author', 'creator'],
  ['dc:publisher', 'publisher'],
  ['dc:creator', 'creator'],
  ['dc:source', 'source'],
  ['dc:title', 'title'],
  ['dc:type', 'type'],
  // Copied under their own name.
  'title',
  'description',
  'author',
  'pubDate',
  'webMaster',
  'managingEditor',
  'generator', // previously listed twice; one entry is enough
  'link',
  'language',
  'copyright',
  'lastBuildDate',
  'docs',
  'ttl',
  'rating',
  'skipHours',
  'skipDays',
];
// Per-item fields for RSS feeds, assembled from three groups. The resulting
// array keeps the groups in the order written, which matters because later
// entries targeting the same property override earlier ones.
fields.item = [].concat(
  // Renamed on copy: [source element, destination property].
  [
    ['author', 'creator'],
    ['dc:creator', 'creator'],
    ['dc:date', 'date'],
    ['dc:language', 'language'],
    ['dc:rights', 'rights'],
    ['dc:source', 'source'],
    ['dc:title', 'title'],
  ],
  // Copied under their own element name.
  ['title', 'link', 'pubDate', 'author', 'summary'],
  // Copied as-is, plus a `content:encodedSnippet` companion property.
  [['content:encoded', 'content:encoded', {includeSnippet: true}]],
  // Copied under their own element name (Dublin Core ones are also kept raw).
  ['enclosure', 'dc:creator', 'dc:date', 'comments']
);
/**
 * Builds the `[from, to]` mapping for an iTunes element: the namespaced
 * element `itunes:<f>` is copied to the un-prefixed property `<f>`.
 *
 * @param {string} f element name without the `itunes:` prefix
 * @returns {Array<string>} `['itunes:' + f, f]`
 */
const mapItunesField = function(f) {
  return ['itunes:' + f, f];
};

// Channel-level iTunes fields.
fields.podcastFeed = ([
  'author',
  'subtitle',
  'summary',
  'explicit'
]).map(mapItunesField);

// Item-level iTunes fields. `image` used to appear twice in this list, which
// only caused the same element to be copied twice; it is now listed once.
fields.podcastItem = ([
  'author',
  'subtitle',
  'summary',
  'explicit',
  'duration',
  'image',
  'episode',
  'season',
  'keywords',
  'episodeType'
]).map(mapItunesField);
+349
View File
@@ -0,0 +1,349 @@
"use strict";
const http = require('http');
const https = require('https');
const xml2js = require('xml2js');
const url = require('url');
const fields = require('./fields');
const utils = require('./utils');
// Headers sent with every feed request; `options.headers` passed to the
// Parser are merged on top of these.
const DEFAULT_HEADERS = {
  'User-Agent': 'rss-parser',
  'Accept': 'application/rss+xml',
};

// Number of redirects followed when the caller does not set `maxRedirects`.
const DEFAULT_MAX_REDIRECTS = 5;

// Request timeout in milliseconds used when the caller does not set `timeout`.
const DEFAULT_TIMEOUT = 60 * 1000;
/**
 * Parses Atom and RSS (0.9x, 1.0/RDF, 2.0) feeds, from a string or a URL,
 * into plain objects of the shape `{ items: [...], ...feedFields }`.
 */
class Parser {
  /**
   * @param {object} [options]
   * @param {object} [options.headers] extra request headers, merged over the defaults
   * @param {object} [options.xml2js] options handed to `xml2js.Parser`
   * @param {object} [options.customFields] `{feed: [...], item: [...]}` extra field specs
   * @param {object} [options.requestOptions] extra options merged into the HTTP request
   * @param {number} [options.maxRedirects] redirects to follow; a falsy value such as 0
   *   disables redirect following (a 3xx response is then reported as an error)
   * @param {number} [options.timeout] request timeout in milliseconds
   * @param {number} [options.defaultRSS] version (0.9, 1 or 2) assumed for `<rss>`
   *   documents whose version attribute is not recognised
   */
  constructor(options = {}) {
    // Normalise into a copy so the caller's object is left untouched.
    const customFields = options.customFields || {};
    const settings = Object.assign({}, options, {
      headers: options.headers || {},
      xml2js: options.xml2js || {},
      customFields: Object.assign({}, customFields, {
        item: customFields.item || [],
        feed: customFields.feed || [],
      }),
      requestOptions: options.requestOptions || {},
    });
    // Only a missing value falls back to the default: parseURL treats a falsy
    // maxRedirects as "do not follow redirects", so an explicit 0 must survive.
    if (settings.maxRedirects == null) settings.maxRedirects = DEFAULT_MAX_REDIRECTS;
    // A timeout of 0 would fire immediately, so any falsy value means "default".
    if (!settings.timeout) settings.timeout = DEFAULT_TIMEOUT;
    this.options = settings;
    this.xmlParser = new xml2js.Parser(this.options.xml2js);
  }

  /**
   * Parses feed XML.
   *
   * @param {string} xml feed document
   * @param {Function} [callback] optional node-style `(err, feed)` callback
   * @returns {Promise} result of `utils.maybePromisify(callback, promise)`,
   *   where the promise resolves to the parsed feed
   */
  parseString(xml, callback) {
    const parsing = new Promise((resolve, reject) => {
      this.xmlParser.parseString(xml, (err, result) => {
        if (err) return reject(err);
        if (!result) {
          return reject(new Error('Unable to parse XML.'));
        }
        // Build inside try/catch so a malformed document rejects the promise
        // here instead of throwing into the XML parser's callback.
        try {
          const version = result.rss && result.rss.$ && result.rss.$.version;
          let feed = null;
          if (result.feed) {
            feed = this.buildAtomFeed(result);
          } else if (version && version.match(/^2/)) {
            feed = this.buildRSS2(result);
          } else if (result['rdf:RDF']) {
            feed = this.buildRSS1(result);
          } else if (version && version.match(/0\.9/)) {
            feed = this.buildRSS0_9(result);
          } else if (result.rss && this.options.defaultRSS) {
            switch (this.options.defaultRSS) {
              case 0.9:
                feed = this.buildRSS0_9(result);
                break;
              case 1:
                feed = this.buildRSS1(result);
                break;
              case 2:
                feed = this.buildRSS2(result);
                break;
              default:
                throw new Error("default RSS version not recognized.");
            }
          } else {
            throw new Error("Feed not recognized as RSS 1 or 2.");
          }
          resolve(feed);
        } catch (e) {
          reject(e);
        }
      });
    });
    return utils.maybePromisify(callback, parsing);
  }

  /**
   * Downloads a feed over HTTP(S) and parses it.
   *
   * @param {string} feedUrl URL of the feed
   * @param {Function} [callback] optional node-style `(err, feed)` callback
   * @param {number} [redirectCount=0] redirects already followed (used when recursing)
   * @returns {Promise} result of `utils.maybePromisify(callback, promise)`
   */
  parseURL(feedUrl, callback, redirectCount=0) {
    let xml = '';
    const get = feedUrl.indexOf('https') === 0 ? https.get : http.get;
    const urlParts = url.parse(feedUrl);
    const headers = Object.assign({}, DEFAULT_HEADERS, this.options.headers);
    let timeout = null;
    let prom = new Promise((resolve, reject) => {
      const requestOpts = Object.assign({headers}, urlParts, this.options.requestOptions);
      const req = get(requestOpts, (res) => {
        const status = res.statusCode;
        if (this.options.maxRedirects && status >= 300 && status < 400 && res.headers['location']) {
          // The body of a redirect is never read; drain it so the socket is released.
          res.resume();
          if (redirectCount === this.options.maxRedirects) {
            return reject(new Error("Too many redirects"));
          }
          const newLocation = url.resolve(feedUrl, res.headers['location']);
          return this.parseURL(newLocation, null, redirectCount + 1).then(resolve, reject);
        } else if (status >= 300) {
          res.resume();
          return reject(new Error("Status code " + status));
        }
        const encoding = utils.getEncodingFromContentType(res.headers['content-type']);
        res.setEncoding(encoding);
        res.on('data', (chunk) => {
          xml += chunk;
        });
        res.on('end', () => {
          return this.parseString(xml).then(resolve, reject);
        });
      });
      req.on('error', reject);
      timeout = setTimeout(() => {
        // Reject first so the timeout error wins, then tear the request down
        // so the connection does not linger after the caller has been told.
        reject(new Error("Request timed out after " + this.options.timeout + "ms"));
        req.destroy();
      }, this.options.timeout);
    }).then(data => {
      clearTimeout(timeout);
      return data;
    }, e => {
      clearTimeout(timeout);
      throw e;
    });
    prom = utils.maybePromisify(callback, prom);
    return prom;
  }

  /**
   * Builds the output feed from a parsed Atom document.
   *
   * @param {object} xmlObj parsed XML with a `feed` root
   * @returns {object} feed with `items`
   */
  buildAtomFeed(xmlObj) {
    const feed = {items: []};
    utils.copyFromXML(xmlObj.feed, feed, this.options.customFields.feed);
    if (xmlObj.feed.link) {
      feed.link = utils.getLink(xmlObj.feed.link, 'alternate', 0);
      feed.feedUrl = utils.getLink(xmlObj.feed.link, 'self', 1);
    }
    if (xmlObj.feed.title) {
      let title = xmlObj.feed.title[0] || '';
      if (title._) title = title._;
      if (title) feed.title = title;
    }
    if (xmlObj.feed.updated) {
      feed.lastBuildDate = xmlObj.feed.updated[0];
    }
    feed.items = (xmlObj.feed.entry || []).map(entry => this.parseItemAtom(entry));
    return feed;
  }

  /**
   * Converts one Atom `<entry>` into an output item.
   *
   * `pubDate` comes from `<published>`, falling back to `<updated>`. A value
   * that does not parse as a date is skipped rather than throwing, so a
   * single bad entry cannot fail the whole feed.
   *
   * @param {object} entry parsed `<entry>` element
   * @returns {object} item
   */
  parseItemAtom(entry) {
    const item = {};
    utils.copyFromXML(entry, item, this.options.customFields.item);
    if (entry.title) {
      let title = entry.title[0] || '';
      if (title._) title = title._;
      if (title) item.title = title;
    }
    if (entry.link && entry.link.length) {
      item.link = utils.getLink(entry.link, 'alternate', 0);
    }

    // ISO string for the first value of a date element, or undefined when
    // the element is absent, empty, or not a parseable date.
    const toISO = (values) => {
      if (!(values && values.length && values[0].length)) return undefined;
      const parsed = new Date(values[0]);
      return Number.isNaN(parsed.getTime()) ? undefined : parsed.toISOString();
    };
    const published = toISO(entry.published);
    if (published) item.pubDate = published;
    if (!item.pubDate) {
      const updated = toISO(entry.updated);
      if (updated) item.pubDate = updated;
    }

    if (entry.author && entry.author.length && entry.author[0].name && entry.author[0].name.length) {
      item.author = entry.author[0].name[0];
    }
    if (entry.content && entry.content.length) {
      item.content = utils.getContent(entry.content[0]);
      item.contentSnippet = utils.getSnippet(item.content);
    }
    if (entry.summary && entry.summary.length) {
      item.summary = utils.getContent(entry.summary[0]);
    }
    if (entry.id) {
      item.id = entry.id[0];
    }
    this.setISODate(item);
    return item;
  }

  /**
   * Builds the output feed from an RSS 0.9x document (items live inside the channel).
   *
   * @param {object} xmlObj parsed XML with an `rss` root
   * @returns {object} feed
   */
  buildRSS0_9(xmlObj) {
    const channel = xmlObj.rss.channel[0];
    const items = channel.item;
    return this.buildRSS(channel, items);
  }

  /**
   * Builds the output feed from an RSS 1.0 document (items are siblings of
   * the channel under `rdf:RDF`).
   *
   * @param {object} xmlObj parsed XML with an `rdf:RDF` root
   * @returns {object} feed
   */
  buildRSS1(xmlObj) {
    xmlObj = xmlObj['rdf:RDF'];
    const channel = xmlObj.channel[0];
    const items = xmlObj.item;
    return this.buildRSS(channel, items);
  }

  /**
   * Builds the output feed from an RSS 2.0 document, adding iTunes data when
   * the root element declares the `xmlns:itunes` namespace.
   *
   * @param {object} xmlObj parsed XML with an `rss` root
   * @returns {object} feed
   */
  buildRSS2(xmlObj) {
    const channel = xmlObj.rss.channel[0];
    const items = channel.item;
    const feed = this.buildRSS(channel, items);
    if (xmlObj.rss.$ && xmlObj.rss.$['xmlns:itunes']) {
      this.decorateItunes(feed, channel);
    }
    return feed;
  }

  /**
   * Shared RSS builder: copies channel fields, image, pagination links and items.
   *
   * @param {object} channel parsed `<channel>` element
   * @param {Array<object>} [items] parsed `<item>` elements
   * @returns {object} feed
   */
  buildRSS(channel, items) {
    items = items || [];
    const feed = {items: []};
    const feedFields = fields.feed.concat(this.options.customFields.feed);
    const itemFields = fields.item.concat(this.options.customFields.item);
    if (channel['atom:link'] && channel['atom:link'][0] && channel['atom:link'][0].$) {
      feed.feedUrl = channel['atom:link'][0].$.href;
    }
    if (channel.image && channel.image[0] && channel.image[0].url) {
      feed.image = {};
      const image = channel.image[0];
      if (image.link) feed.image.link = image.link[0];
      if (image.url) feed.image.url = image.url[0];
      if (image.title) feed.image.title = image.title[0];
      if (image.width) feed.image.width = image.width[0];
      if (image.height) feed.image.height = image.height[0];
    }
    const paginationLinks = this.generatePaginationLinks(channel);
    if (Object.keys(paginationLinks).length) {
      feed.paginationLinks = paginationLinks;
    }
    utils.copyFromXML(channel, feed, feedFields);
    feed.items = items.map(xmlItem => this.parseItemRss(xmlItem, itemFields));
    return feed;
  }

  /**
   * Converts one RSS `<item>` into an output item.
   *
   * @param {object} xmlItem parsed `<item>` element
   * @param {Array} itemFields field specs to copy (built-in plus custom)
   * @returns {object} item
   */
  parseItemRss(xmlItem, itemFields) {
    const item = {};
    utils.copyFromXML(xmlItem, item, itemFields);
    if (xmlItem.enclosure) {
      // The enclosure is exposed as its attribute map.
      item.enclosure = xmlItem.enclosure[0].$;
    }
    if (xmlItem.description) {
      item.content = utils.getContent(xmlItem.description[0]);
      item.contentSnippet = utils.getSnippet(item.content);
    }
    if (xmlItem.guid) {
      item.guid = xmlItem.guid[0];
      if (item.guid._) item.guid = item.guid._;
    }
    if (xmlItem.$ && xmlItem.$['rdf:about']) {
      item['rdf:about'] = xmlItem.$['rdf:about'];
    }
    if (xmlItem.category) item.categories = xmlItem.category;
    this.setISODate(item);
    return item;
  }

  /**
   * Add iTunes specific fields from XML to extracted JSON
   *
   * @access public
   * @param {object} feed extracted
   * @param {object} channel parsed XML
   */
  decorateItunes(feed, channel) {
    const items = channel.item || [];
    feed.itunes = {};

    if (channel['itunes:owner']) {
      const ownerXml = channel['itunes:owner'][0];
      const owner = {};
      if (ownerXml['itunes:name']) {
        owner.name = ownerXml['itunes:name'][0];
      }
      if (ownerXml['itunes:email']) {
        owner.email = ownerXml['itunes:email'][0];
      }
      feed.itunes.owner = owner;
    }

    if (channel['itunes:image']) {
      const imageXml = channel['itunes:image'][0];
      const href = imageXml && imageXml.$ && imageXml.$.href;
      if (href) {
        feed.itunes.image = href;
      }
    }

    if (channel['itunes:category']) {
      const categoriesWithSubs = channel['itunes:category'].map((category) => {
        return {
          name: category && category.$ && category.$.text,
          subs: category['itunes:category'] ?
            category['itunes:category']
              .map((subcategory) => ({
                name: subcategory && subcategory.$ && subcategory.$.text
              })) : null,
        };
      });
      feed.itunes.categories = categoriesWithSubs.map((category) => category.name);
      feed.itunes.categoriesWithSubs = categoriesWithSubs;
    }

    if (channel['itunes:keywords']) {
      if (channel['itunes:keywords'].length > 1) {
        // Several elements: each keyword is read from its `text` attribute.
        feed.itunes.keywords = channel['itunes:keywords'].map(
          keyword => keyword && keyword.$ && keyword.$.text
        );
      } else {
        // One element: a comma-separated list, either as a `text` attribute
        // or as the element's text content.
        let keywords = channel['itunes:keywords'][0];
        if (keywords && typeof keywords._ === 'string') {
          keywords = keywords._;
        }
        if (keywords && keywords.$ && keywords.$.text) {
          feed.itunes.keywords = keywords.$.text.split(',');
        } else if (typeof keywords === "string") {
          feed.itunes.keywords = keywords.split(',');
        }
      }
    }

    utils.copyFromXML(channel, feed.itunes, fields.podcastFeed);

    // feed.items was built from channel.item in the same order, so indexes line up.
    items.forEach((item, index) => {
      const entry = feed.items[index];
      entry.itunes = {};
      utils.copyFromXML(item, entry.itunes, fields.podcastItem);
      const image = item['itunes:image'];
      if (image && image[0] && image[0].$ && image[0].$.href) {
        entry.itunes.image = image[0].$.href;
      }
    });
  }

  /**
   * Sets `item.isoDate` from `item.pubDate` (or `item.date`) when that value
   * can be converted to an ISO string; otherwise leaves the item unchanged.
   *
   * @param {object} item output item, modified in place
   */
  setISODate(item) {
    const date = item.pubDate || item.date;
    if (date) {
      try {
        item.isoDate = new Date(date.trim()).toISOString();
      } catch (e) {
        // Deliberately best-effort: an unparseable or non-string date just
        // means no isoDate is produced.
      }
    }
  }

  /**
   * Generates a pagination object where the rel attribute is the key and href attribute is the value
   *  { self: 'self-url', first: 'first-url', ... }
   *
   * @access private
   * @param {Object} channel parsed XML
   * @returns {Object}
   */
  generatePaginationLinks(channel) {
    if (!channel['atom:link']) {
      return {};
    }
    const paginationRelAttributes = ['self', 'first', 'next', 'prev', 'last'];
    return channel['atom:link'].reduce((paginationLinks, link) => {
      if (!link.$ || !paginationRelAttributes.includes(link.$.rel)) {
        return paginationLinks;
      }
      paginationLinks[link.$.rel] = link.$.href;
      return paginationLinks;
    }, {});
  }
}
module.exports = Parser;
+85
View File
@@ -0,0 +1,85 @@
const utils = module.exports = {};
const entities = require('entities');
const xml2js = require('xml2js');
/**
 * Removes markup from an HTML string.
 *
 * First, a block-level tag sitting between two non-newline characters is
 * replaced by a newline between those characters; then every remaining tag
 * is deleted.
 *
 * @param {string} str HTML text
 * @returns {string} text without tags
 */
utils.stripHtml = function(str) {
  const blockTagBetweenText = /([^\n])<\/?(h|br|p|ul|ol|li|blockquote|section|table|tr|div)(?:.|\n)*?>([^\n])/gm;
  const anyTag = /<(?:.|\n)*?>/gm;
  return str
    .replace(blockTagBetweenText, '$1\n$3')
    .replace(anyTag, '');
};
/**
 * Produces a plain-text snippet: tags are stripped with `utils.stripHtml`,
 * the result is passed through `entities.decodeHTML`, and surrounding
 * whitespace is trimmed.
 *
 * @param {string} str HTML text
 * @returns {string} trimmed plain text
 */
utils.getSnippet = function(str) {
  const withoutTags = utils.stripHtml(str);
  const decoded = entities.decodeHTML(withoutTags);
  return decoded.trim();
};
/**
 * Picks the `href` of the first link whose `rel` attribute equals `rel`.
 * When none matches, the `href` of the link at `fallbackIdx` is returned.
 *
 * Link elements without attributes (for example a bare `<link>url</link>`,
 * which has no `$` attribute map) are skipped instead of raising a TypeError.
 *
 * @param {Array<object>} links parsed link elements; attributes live under `$`
 * @param {string} rel wanted `rel` value
 * @param {number} fallbackIdx index used when no link has the wanted `rel`
 * @returns {string|undefined} the href, or undefined when nothing usable is found
 */
utils.getLink = function(links, rel, fallbackIdx) {
  if (!links) return;
  for (let i = 0; i < links.length; ++i) {
    const attrs = links[i] && links[i].$;
    if (attrs && attrs.rel === rel) return attrs.href;
  }
  const fallback = links[fallbackIdx];
  if (fallback && fallback.$) return fallback.$.href;
};
/**
 * Extracts the textual content of a parsed XML node.
 *
 *  - a node with a string `_` property yields that string;
 *  - any other object is serialised back to markup with an `xml2js.Builder`
 *    configured with `div` as the root name;
 *  - every other value is returned unchanged.
 *
 * @param {*} content parsed node or plain value
 * @returns {*} string content, or the input itself for non-object values
 */
utils.getContent = function(content) {
  if (typeof content._ === 'string') {
    return content._;
  }
  if (typeof content !== 'object') {
    return content;
  }
  const builderOptions = {headless: true, explicitRoot: true, rootName: 'div', renderOpts: {pretty: false}};
  const builder = new xml2js.Builder(builderOptions);
  return builder.buildObject(content);
};
/**
 * Copies selected elements from a parsed XML node onto `dest`.
 *
 * Each field spec is either a string (same source and destination name) or
 * an array `[from, to, options]`. Recognised options:
 *  - `keepArray`: copy the whole value array instead of its first element;
 *  - `includeSnippet`: also write `<to>Snippet` via `utils.getSnippet` when
 *    the copied value is a non-empty string.
 * A copied object carrying a string `_` property is replaced by that string.
 *
 * @param {object} xml parsed XML node to read from
 * @param {object} dest object to write to (modified in place)
 * @param {Array} fields field specs
 */
utils.copyFromXML = function(xml, dest, fields) {
  fields.forEach((spec) => {
    const isTuple = Array.isArray(spec);
    const source = isTuple ? spec[0] : spec;
    const target = isTuple ? spec[1] : spec;
    const settings = isTuple && spec.length > 2 ? spec[2] : {};
    const keepArray = settings.keepArray;
    const includeSnippet = settings.includeSnippet;

    if (xml[source] !== undefined) {
      dest[target] = keepArray ? xml[source] : xml[source][0];
    }

    // Unwrap nodes of the form {_: 'text', $: {...}} to their text.
    const copied = dest[target];
    if (copied && typeof copied._ === 'string') {
      dest[target] = copied._;
    }

    if (includeSnippet && dest[target] && typeof dest[target] === 'string') {
      dest[target + 'Snippet'] = utils.getSnippet(dest[target]);
    }
  });
};
/**
 * Bridges a promise to an optional node-style callback.
 *
 * Without a callback the promise itself is returned. With one, the callback
 * is scheduled through `setTimeout` as `callback(null, data)` on fulfilment
 * or `callback(err)` on rejection, and the chained promise is returned.
 *
 * @param {Function} [callback] node-style `(err, data)` callback
 * @param {Promise} promise promise to observe
 * @returns {Promise} `promise`, or the promise chained with the callback handlers
 */
utils.maybePromisify = function(callback, promise) {
  if (callback) {
    const deliverResult = (data) => setTimeout(() => callback(null, data));
    const deliverError = (err) => setTimeout(() => callback(err));
    return promise.then(deliverResult, deliverError);
  }
  return promise;
};
// Encoding used when the header names none, or one that is not supported.
const DEFAULT_ENCODING = 'utf8';
// Captures the value of a `charset=` / `encoding=` parameter. The parameter
// name is matched case-insensitively, an opening quote is skipped, and the
// value stops at whitespace, a quote or `;` so that `charset="utf-8"` and
// `charset=utf-8; q=1` both yield `utf-8`.
const ENCODING_REGEX = /(encoding|charset)\s*=\s*["']?([^\s"';]+)/i;
// Encoding names accepted as-is (after alias resolution).
const SUPPORTED_ENCODINGS = ['ascii', 'utf8', 'utf16le', 'ucs2', 'base64', 'latin1', 'binary', 'hex'];
// Header spellings mapped to the names listed above.
const ENCODING_ALIASES = {
  'utf-8': 'utf8',
  'iso-8859-1': 'latin1',
};

/**
 * Derives the stream encoding from a `Content-Type` header value.
 *
 * @param {string} [contentType] header value, e.g. `text/xml; charset=utf-8`
 * @returns {string} one of SUPPORTED_ENCODINGS; DEFAULT_ENCODING when the
 *   header is missing, has no charset, or names an unsupported one
 */
utils.getEncodingFromContentType = function(contentType) {
  contentType = contentType || '';
  const match = contentType.match(ENCODING_REGEX);
  let encoding = ((match || [])[2] || '').toLowerCase();
  // Own-property check so names like "constructor" are not resolved through
  // the prototype chain.
  if (Object.prototype.hasOwnProperty.call(ENCODING_ALIASES, encoding)) {
    encoding = ENCODING_ALIASES[encoding];
  }
  if (!encoding || !SUPPORTED_ENCODINGS.includes(encoding)) {
    encoding = DEFAULT_ENCODING;
  }
  return encoding;
};