Skip to content

Commit a1554ab

Browse files
committed
v4.1.0 Parse JSON-LD
1 parent 2d1bdb3 commit a1554ab

File tree

4 files changed

+125
-6
lines changed

4 files changed

+125
-6
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@ node_modules
2828
# Users Environment Variables
2929
.lock-wscript
3030
dist
31+
test-report.junit.xml

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "metafetch",
33
"description": "Metafetch fetches a given URL's title, description, images, links etc.",
4-
"version": "4.0.1",
4+
"version": "4.1.0",
55
"homepage": "https://github.com/brahma-dev/metafetch",
66
"repository": {
77
"type": "git",

src/index.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ export class Metafetch {
117117
result.charset = encoding;
118118
}
119119
this._extractMeta(document, result, flags);
120+
this._extractStructuredData(document, result, flags);
120121
this._extractUrls(document, response, result, flags);
121122
this._extractAssets(document, result, flags);
122123

@@ -225,6 +226,36 @@ export class Metafetch {
225226
result.links = [...linkHrefs];
226227
}
227228
}
229+
230+
/**
 * Flattens JSON-LD structured data into `result.meta` under `ld:`-prefixed keys.
 *
 * Every `<script type="application/ld+json">` element found in the document is
 * parsed independently, in the order `querySelectorAll` returns them:
 *  - a top-level string value is stored as `ld:<key>`;
 *  - a top-level plain (non-array) object is flattened one level deep, storing each
 *    of its own properties as `ld:<key>:<subKey>` with the value left as parsed
 *    (it is NOT restricted to strings, unlike the top level);
 *  - every other top-level value (array, number, boolean, null) is ignored.
 * Keys written by a later script overwrite the same keys from an earlier one.
 *
 * A blank script body is skipped silently. A body that is not valid JSON is reported
 * through `console.warn` and skipped; other scripts on the page are still processed.
 * Parsed values that are not objects (e.g. a bare JSON string) are ignored.
 *
 * @param doc - Parsed document that is searched for JSON-LD script elements.
 * @param result - Response object whose `meta` map receives the flattened entries.
 * @param flags - Resolved extraction flags; nothing is extracted unless `flags.meta` is truthy.
 */
private _extractStructuredData(doc: Document, result: MetafetchResponse, flags: ResolvedFlags) {
  if (!flags.meta) return;

  doc.querySelectorAll('script[type="application/ld+json"]').forEach(script => {
    const raw = (script.textContent ?? '').trim();
    // An empty or whitespace-only script carries no data: skip it quietly instead of
    // handing it to JSON.parse and logging a spurious warning.
    if (raw === '') return;

    // Only the parse itself is guarded, so the warning below always means what it says.
    let json: unknown;
    try {
      json = JSON.parse(raw);
    } catch (e) {
      console.warn('Error parsing JSON-LD:', e);
      return;
    }
    if (typeof json !== 'object' || json === null) return;

    // Create the map on demand rather than asserting it exists; a missing map would
    // otherwise surface as a TypeError on the first write.
    if (result.meta == null) result.meta = {};
    const ldMeta = result.meta;

    // Object.entries visits own enumerable keys only and never invokes a method on the
    // parsed object, so a JSON-LD property literally named "hasOwnProperty" cannot
    // break the iteration (calling json.hasOwnProperty(...) would throw in that case).
    for (const [key, value] of Object.entries(json)) {
      if (typeof value === 'string') {
        ldMeta[`ld:${key}`] = value;
      } else if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
        for (const [subKey, subValue] of Object.entries(value)) {
          ldMeta[`ld:${key}:${subKey}`] = subValue;
        }
      }
    }
  });
}
228259
}
229260

230261
export const metafetch = new Metafetch();

test/test.ts

Lines changed: 92 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ describe('Metafetch: Final Optimized Tests', () => {
77
let serverInvalidAssets: Server, serverUaEcho: Server, serverEmptyBody: Server,
88
serverPrimaryMeta: Server, serverBaseTag: Server, serverCharset: Server,
99
serverFallbackMeta: Server, serverAssetFallback: Server, serverBaseNoHref: Server,
10-
serverMalformedAssets: Server, serverAmp: Server, serverHttp: Server;
10+
serverMalformedAssets: Server, serverAmp: Server, serverHttp: Server, serverJsonLd: Server;
1111

1212
before((done) => {
1313
serverInvalidAssets = http.createServer((req, res) => {
@@ -48,14 +48,39 @@ describe('Metafetch: Final Optimized Tests', () => {
4848
else if (req.url?.startsWith('/page')) res.setHeader('Content-Type', 'text/html').end('<html><title>T</title></html>');
4949
else res.setHeader('Content-Type', 'application/pdf').end('%PDF-1.4');
5050
}).listen(2511, '127.0.0.1');
51-
serverHttp.on('listening', done);
51+
serverJsonLd = http.createServer((req, res) => {
52+
res.setHeader('Content-Type', 'text/html');
53+
let body = '';
54+
switch (req.url) {
55+
case '/basic':
56+
body = `<html><head><script type="application/ld+json">{"@context":"https://schema.org","@type":"NewsArticle","headline":"Article Headline"}</script></head></html>`;
57+
break;
58+
case '/nested':
59+
body = `<html><head><script type="application/ld+json">{"@context":"https://schema.org","author":{"@type":"Person","name":"Jane Doe"}, "unsupported": ["item1", "item2"]}</script></head></html>`;
60+
break;
61+
case '/malformed':
62+
body = `<html><head><meta name="description" content="Good"><script type="application/ld+json">{ "key": "value", </script></head></html>`;
63+
break;
64+
case '/multiple':
65+
body = `<html><head><meta name="description" content="A page with two scripts."><script type="application/ld+json">{"@type":"Organization","name":"My Company"}</script><script type="application/ld+json">{"@type":"WebSite","url":"https://example.com"}</script></head></html>`;
66+
break;
67+
case '/empty':
68+
body = `<html><head><script type="application/ld+json"></script></head></html>`;
69+
break;
70+
case '/non_object':
71+
body = `<html><head><script type="application/ld+json">"this is a string, not an object"</script></head></html>`;
72+
break;
73+
}
74+
res.end(body);
75+
}).listen(2512, '127.0.0.1');
76+
serverJsonLd.on('listening', done);
5277
});
5378

5479
after(() => {
5580
serverInvalidAssets.close(); serverUaEcho.close(); serverEmptyBody.close();
5681
serverPrimaryMeta.close(); serverBaseTag.close(); serverCharset.close();
5782
serverFallbackMeta.close(); serverAssetFallback.close(); serverBaseNoHref.close();
58-
serverMalformedAssets.close(); serverAmp.close(); serverHttp.close();
83+
serverMalformedAssets.close(); serverAmp.close(); serverHttp.close(); serverJsonLd.close();
5984
});
6085

6186
// --- Test Suites ---
@@ -65,7 +90,7 @@ describe('Metafetch: Final Optimized Tests', () => {
6590
it((++counter).toString().padStart(2, '0') + '. should return a Promise', () => {
6691
const promise = new Metafetch().fetch('http://127.0.0.1:2511/page');
6792
expect(promise).to.be.an.instanceOf(Promise);
68-
promise.catch(() => {}); // Suppress unhandled rejection warning
93+
promise.catch(() => { });
6994
});
7095

7196
it((++counter).toString().padStart(2, '0') + '. should reject with an error for an empty URL', async () => {
@@ -103,7 +128,7 @@ describe('Metafetch: Final Optimized Tests', () => {
103128
}
104129
});
105130
});
106-
131+
107132
describe('3. User-Agent Management', () => {
108133
let counter = 0;
109134
it((++counter).toString().padStart(2, '0') + '. should manage the instance user agent correctly', () => {
@@ -206,4 +231,66 @@ describe('Metafetch: Final Optimized Tests', () => {
206231
expect(res.images).to.be.an('array').that.is.not.empty;
207232
});
208233
});
234+
235+
// Suite 7: JSON-LD extraction. Every test fetches a route from the local fixture
// server on 127.0.0.1:2512 and inspects the `ld:`-prefixed keys in `res.meta`.
// `counter` only produces the zero-padded number at the start of each test title.
describe('7. Structured Data (JSON-LD)', () => {
236+
let counter = 0;
237+
const instance = new Metafetch();
238+
239+
// Top-level string properties are expected to appear as `ld:<key>`.
it((++counter).toString().padStart(2, '0') + '. should extract basic, flat JSON-LD data', async () => {
240+
const res = await instance.fetch('http://127.0.0.1:2512/basic');
241+
expect(res.meta).to.deep.include({
242+
'ld:@context': 'https://schema.org',
243+
'ld:@type': 'NewsArticle',
244+
'ld:headline': 'Article Headline'
245+
});
246+
});
247+
248+
// Properties of a nested object are expected to appear as `ld:<key>:<subKey>`.
it((++counter).toString().padStart(2, '0') + '. should extract and flatten nested JSON-LD data', async () => {
249+
const res = await instance.fetch('http://127.0.0.1:2512/nested');
250+
expect(res.meta).to.deep.include({
251+
'ld:@context': 'https://schema.org',
252+
'ld:author:@type': 'Person',
253+
'ld:author:name': 'Jane Doe'
254+
});
255+
// The current implementation doesn't handle arrays, so 'unsupported' should not exist
256+
expect(res.meta).to.not.have.property('ld:unsupported');
257+
});
258+
259+
// A JSON-LD script that fails to parse must not prevent ordinary <meta> extraction.
it((++counter).toString().padStart(2, '0') + '. should handle malformed JSON-LD gracefully without crashing', async () => {
260+
const res = await instance.fetch('http://127.0.0.1:2512/malformed');
261+
// Regular meta tags should still be parsed
262+
expect(res.meta!.description).to.equal('Good');
263+
// Malformed ld+json should not add any 'ld:' keys
264+
const ldKeys = Object.keys(res.meta!).filter(k => k.startsWith('ld:'));
265+
expect(ldKeys).to.be.empty;
266+
});
267+
268+
// Uses deep.equal (not deep.include), so the listed keys must be the ONLY keys in meta.
it((++counter).toString().padStart(2, '0') + '. should merge data from multiple JSON-LD scripts', async () => {
269+
const res = await instance.fetch('http://127.0.0.1:2512/multiple');
270+
// Note: The current implementation overwrites duplicate keys.
271+
expect(res.meta).to.deep.equal({
272+
'description': 'A page with two scripts.',
273+
'ld:@type': 'WebSite', // Overwritten by second script
274+
'ld:name': 'My Company',
275+
'ld:url': 'https://example.com'
276+
});
277+
});
278+
279+
// With the meta flag off, `meta` is expected to be absent from the response altogether.
it((++counter).toString().padStart(2, '0') + '. should not extract JSON-LD when meta flag is disabled', async () => {
280+
const res = await instance.fetch('http://127.0.0.1:2512/basic', { flags: { meta: false } });
281+
expect(res.meta).to.be.undefined;
282+
});
283+
284+
// A script element with no body must leave meta as an empty object.
it((++counter).toString().padStart(2, '0') + '. should handle an empty JSON-LD script tag', async () => {
285+
const res = await instance.fetch('http://127.0.0.1:2512/empty');
286+
expect(res.meta).to.be.an('object').that.is.empty;
287+
});
288+
289+
// Valid JSON that is not an object (here a bare string) must contribute nothing.
it((++counter).toString().padStart(2, '0') + '. should ignore JSON-LD content that is not a JSON object', async () => {
290+
const res = await instance.fetch('http://127.0.0.1:2512/non_object');
291+
expect(res.meta).to.be.an('object').that.is.empty;
292+
const ldKeys = Object.keys(res.meta!).filter(k => k.startsWith('ld:'));
293+
expect(ldKeys).to.be.empty;
294+
});
295+
});
209296
});

0 commit comments

Comments
 (0)