|
1 |
| -# Node [metafetch](https://www.npmjs.org/package/metafetch) |
| 1 | +# Node metafetch |
2 | 2 |
|
3 | 3 | [Build Status](https://github.com/brahma-dev/metafetch/actions/workflows/build.yml)
|
4 |
| -[](https://codecov.io/github/brahma-dev/metafetch) |
5 |
| -[](https://coveralls.io/github/brahma-dev/metafetch) |
6 |
| -[](https://snyk.io/test/npm/metafetch) |
| 4 | +[Code Coverage (Codecov)](https://codecov.io/gh/brahma-dev/metafetch) |
| 5 | +[Coverage Status (Coveralls)](https://coveralls.io/github/brahma-dev/metafetch?branch=main) |
| 6 | +[Known Vulnerabilities (Snyk)](https://snyk.io/test/npm/metafetch) |
| 7 | +[npm package](https://www.npmjs.org/package/metafetch) |
7 | 8 |
|
8 |
| -Metafetch fetches a given URL's title, description, images, links etc. |
| 9 | +**Metafetch** is a library to fetch and parse metadata from a web page. It can extract standard meta tags, Open Graph data, JSON-LD, favicons, and feeds, and even render client-side JavaScript to get data from Single-Page Applications (SPAs). |
| 10 | + |
| 11 | +## Key Features |
| 12 | + |
| 13 | +* **Comprehensive Metadata:** Extracts title, description, URL, site name, images, links, and more. |
| 14 | +* **Client-Side Rendering:** Uses **Puppeteer** to render JavaScript-heavy sites, ensuring you get the metadata even from SPAs built with frameworks such as React, Vue, or Svelte. |
| 15 | +* **Robust Network Handling:** Features built-in **request retries with exponential backoff** to handle transient network errors gracefully. |
| 16 | +* **Rich Content Discovery:** |
| 17 | + * Finds the best-quality **favicon** by prioritizing Apple touch icons and the largest available sizes. |
| 18 | + * Discovers **RSS/Atom feeds** linked in the page. |
| 19 | + * Parses and flattens structured **JSON-LD** data. |
| 20 | +* **Advanced Encoding Detection:** Accurately detects character encoding via BOM, HTTP headers, and meta tags to prevent garbled text. |
| 21 | +* **Highly Configurable:** Fine-tune requests with custom headers, user agents, and feature flags to parse only what you need. |
9 | 22 |
|
10 | 23 | ## Installation
|
11 | 24 |
|
12 | 25 | Use NPM to install:
|
13 | 26 |
|
14 |
| - npm install metafetch |
| 27 | +```bash |
| 28 | +npm install metafetch puppeteer |
| 29 | +``` |
| 30 | +*Note: `puppeteer` is a peer dependency that is only needed for the client-side rendering feature (`render: true`). It is not bundled with `metafetch`, so it must be installed separately, as in the command above.* |
15 | 31 |
|
16 | 32 | ## Usage
|
17 | 33 |
|
18 |
| - import metafetch from 'metafetch'; |
19 |
| - |
20 |
| - metafetch.fetch('http://www.facebook.com'[, options]).then(function(meta) { |
21 |
| - console.log('title: ', meta.title); |
22 |
| - console.log('description: ', meta.description); |
23 |
| - console.log('type: ', meta.type); |
24 |
| - console.log('url: ', meta.url); |
25 |
| - console.log('ampURL: ', meta.ampURL); |
26 |
| - console.log('siteName: ', meta.siteName); |
27 |
| - console.log('charset: ', meta.charset); |
28 |
| - console.log('image: ', meta.image); |
29 |
| - console.log('meta: ', meta.meta); |
30 |
| - console.log('images: ', meta.images); |
31 |
| - console.log('links: ', meta.links); |
32 |
| - console.log('headers: ', meta.headers); |
33 |
| - console.log('language: ', meta.language); |
34 |
| - }).catch(console.error); |
35 |
| - |
36 |
| -#### Optional flags to disable parsing images and links and http timeout or headers |
37 |
| - |
38 |
| - metafetch.fetch('http://www.facebook.com', { |
39 |
| - userAgent: "User Agent/Defaults to Firefox 123 (February 20, 2024)", |
40 |
| - flags: { |
41 |
| - images: false, |
42 |
| - links: false, |
43 |
| - language: false |
44 |
| - }, |
45 |
| - http: { |
46 |
| - timeout: 30000, |
47 |
| - headers: { |
48 |
| - Accept: "*/*" |
49 |
| - } |
| 34 | +### Basic Example |
| 35 | + |
| 36 | +Here's a simple example using `async/await`. |
| 37 | + |
| 38 | +```javascript |
| 39 | +import metafetch from 'metafetch'; |
| 40 | + |
| 41 | +async function getMeta(url) { |
| 42 | + try { |
| 43 | + const meta = await metafetch.fetch(url); |
| 44 | + console.log(meta); |
| 45 | + } catch (err) { |
| 46 | + console.error(err); |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | +getMeta('https://example.com'); |
| 51 | +``` |
| 52 | + |
| 53 | +This will output an object like the following (the values shown are representative): |
| 54 | + |
| 55 | +```js |
| 56 | +{ |
| 57 | + title: 'Awesome Web Page', |
| 58 | + description: 'A compelling description of the page content, often used by search engines.', |
| 59 | + type: 'article', |
| 60 | + url: 'https://example.com/article-path', |
| 61 | + originalURL: 'https://example.com', |
| 62 | + siteName: 'Example News', |
| 63 | + charset: 'utf-8', |
| 64 | + image: 'https://example.com/images/featured-image.png', |
| 65 | + favicon: 'https://example.com/favicons/apple-touch-icon.png', |
| 66 | + feeds: [ 'https://example.com/rss.xml' ], |
| 67 | + meta: { |
| 68 | + 'og:title': 'Awesome Web Page', |
| 69 | + 'og:description': 'A compelling description...', |
| 70 | + 'ld:headline': 'Awesome Web Page', |
| 71 | + /* ... other meta and JSON-LD tags ... */ |
| 72 | + }, |
| 73 | + images: [ 'https://example.com/images/featured-image.png', /* ... */ ], |
| 74 | + links: [ 'https://example.com/another-page', /* ... */ ], |
| 75 | + headers: { 'content-type': 'text/html; charset=utf-8', /* ... */ }, |
| 76 | + language: 'en' |
| 77 | +} |
| 78 | +``` |
| 79 | + |
| 80 | +### Advanced Example (Rendering SPAs & Retries) |
| 81 | + |
| 82 | +To get metadata from a client-rendered page or to make your request more robust, use the advanced options. |
| 83 | + |
| 84 | +```javascript |
| 85 | +import metafetch from 'metafetch'; |
| 86 | + |
| 87 | +async function getSpaMeta(url) { |
| 88 | + try { |
| 89 | + const meta = await metafetch.fetch(url, { |
| 90 | + // Use puppeteer to render the page |
| 91 | + render: true, |
| 92 | + |
| 93 | + // Retry up to 2 times on failure |
| 94 | + retries: 2, |
| 95 | + retryDelay: 1000, // 1 second base delay |
| 96 | + |
| 97 | + // Disable parsing of things you don't need |
| 98 | + flags: { |
| 99 | + images: false, |
| 100 | + links: false |
| 101 | + }, |
| 102 | + |
| 103 | + // Pass custom options to the underlying fetch call |
| 104 | + fetch: { |
| 105 | + headers: { |
| 106 | + 'Accept-Language': 'en-US,en;q=0.9' |
50 | 107 | }
|
51 |
| - }).then(function(meta) { |
52 |
| - console.log('title: ', meta.title); |
53 |
| - console.log('description: ', meta.description); |
54 |
| - console.log('type: ', meta.type); |
55 |
| - console.log('url: ', meta.url); |
56 |
| - console.log('ampURL: ', meta.ampURL); |
57 |
| - console.log('siteName: ', meta.siteName); |
58 |
| - console.log('charset: ', meta.charset); |
59 |
| - console.log('image: ', meta.image); |
60 |
| - console.log('meta: ', meta.meta); |
61 |
| - console.log('headers: ', meta.headers); |
62 |
| - console.log('language: ', meta.language); |
63 |
| - }).catch(console.error);; |
64 |
| - |
65 |
| -#### Set User Agent across instance |
66 |
| - |
67 |
| - metafetch.setUserAgent("PersonalBot"); |
68 |
| - |
69 |
| -#### Multiple instances with different User Agent |
70 |
| - |
71 |
| - import { Metafetch } from 'metafetch'; |
72 |
| - const instance0 = new Metafetch("Bot 0"); |
73 |
| - const instance1 = new Metafetch("Bot 1"); |
74 |
| - |
75 |
| - -- or -- |
76 |
| - |
77 |
| - const instance0 = new Metafetch(); |
78 |
| - instance0.setUserAgent("Bot 0") |
79 |
| - const instance1 = new Metafetch(); |
80 |
| - instance1.setUserAgent("Bot 1") |
81 |
| - |
82 |
| -### Response Data |
83 |
| - |
84 |
| -- `title` : Page title. |
85 |
| -- `description` : Page description or `og:description` meta tag. |
86 |
| -- `image` : `og:image` meta tag. |
87 |
| -- `url` : Page url or `og:url` meta tag. |
88 |
| -- `ampURL` : URL from amphtml tag or null. |
89 |
| -- `images` : All images on this page. |
90 |
| -- `links` : All links on this page. |
91 |
| -- `meta` : All the meta tags that have a `property` or `name` attribute, e.g. `<meta property="author" content="Example">`, `<meta name="description" content="Example.">` |
92 |
| -- `headers` : HTTP headers, lowercasing field names much like node does. |
93 |
| -- `language` : Content language (ISO 639-1) based on meta data/headers. |
| 108 | + } |
| 109 | + }); |
| 110 | + |
| 111 | + console.log('Title:', meta.title); |
| 112 | + console.log('Description:', meta.description); |
| 113 | + |
| 114 | + } catch (err) { |
| 115 | + console.error(err); |
| 116 | + } |
| 117 | +} |
| 118 | + |
| 119 | +getSpaMeta('https://my-single-page-app.com'); |
| 120 | +``` |
| 121 | + |
| 122 | +## API |
| 123 | + |
| 124 | +### `metafetch.fetch(url, [options])` |
| 125 | + |
| 126 | +Fetches and parses metadata from the given `url`. |
| 127 | + |
| 128 | +#### Options (`FetchOptions`) |
| 129 | + |
| 130 | +| Option | Type | Default | Description | |
| 131 | +|---|---|---|---| |
| 132 | +| `render` | `boolean` | `false` | If `true`, uses Puppeteer to render the page's JavaScript before parsing. | |
| 133 | +| `retries` | `number` | `0` | Number of times to retry the request on failure. | |
| 134 | +| `retryDelay` | `number` | `1000` | Base delay in milliseconds for retries (uses exponential backoff). | |
| 135 | +| `userAgent` | `string` | A Firefox User-Agent string | The User-Agent string to use for the request. |
| 136 | +| `flags` | `object` | `{...}` | An object to enable/disable parsing specific fields. All flags are `true` by default. See the list below for all available flags. | |
| 137 | +| `fetch` | `object` | `{}` | `RequestInit` options passed directly to the `fetch` call (e.g., `{ headers: {...} }`). | |
| 138 | + |
| 139 | +### Available Flags |
| 140 | + |
| 141 | +You can pass any of these boolean flags in the `flags` object to optimize parsing. For example, `flags: { links: false, images: false }` will skip extracting links and images. |
| 142 | + |
| 143 | +* `title` |
| 144 | +* `description` |
| 145 | +* `type` |
| 146 | +* `url` |
| 147 | +* `siteName` |
| 148 | +* `charset` |
| 149 | +* `image` |
| 150 | +* `meta` (includes all meta tags and flattened JSON-LD) |
| 151 | +* `images` |
| 152 | +* `links` |
| 153 | +* `headers` |
| 154 | +* `language` |
| 155 | +* `favicon` |
| 156 | +* `feeds` |
| 157 | + |
| 158 | +### Instance Management |
| 159 | + |
| 160 | +You can create multiple instances of `Metafetch`, each with its own default User-Agent. |
| 161 | + |
| 162 | +```javascript |
| 163 | +import { Metafetch } from 'metafetch'; |
| 164 | + |
| 165 | +// Create an instance with a custom default User-Agent |
| 166 | +const myBot = new Metafetch("MyPersonalBot/1.0"); |
| 167 | +await myBot.fetch('https://example.com'); |
| 168 | + |
| 169 | +// You can also change it later |
| 170 | +myBot.setUserAgent("MyPersonalBot/2.0"); |
| 171 | +console.log(myBot.userAgent); // "MyPersonalBot/2.0" |
| 172 | +``` |
| 173 | + |
| 174 | +## Response Data |
| 175 | + |
| 176 | +| Field | Type | Description | |
| 177 | +|---|---|---| |
| 178 | +| `title` | `string` | The page's `<title>` tag. | |
| 179 | +| `description` | `string` | The `og:description` or `description` meta tag. | |
| 180 | +| `type` | `string` | The `og:type` meta tag (e.g., "website", "article"). | |
| 181 | +| `url` | `string` | The page's URL. Prioritizes the `canonical` link or `og:url` when present; otherwise the final URL after redirects. |
| 182 | +| `originalURL`| `string` | The URL that was originally passed to the fetch method. | |
| 183 | +| `ampURL` | `string` | The URL from a `<link rel="amphtml">` tag, if present. | |
| 184 | +| `siteName` | `string` | The `og:site_name` meta tag. | |
| 185 | +| `charset` | `string` | The detected character encoding of the page. | |
| 186 | +| `image` | `string` | The `og:image` or `twitter:image` meta tag. | |
| 187 | +| `favicon` | `string` | The best-quality favicon URL found on the page. | |
| 188 | +| `feeds` | `string[]`| An array of RSS/Atom feed URLs discovered on the page. | |
| 189 | +| `meta` | `object` | A key-value object of all meta tags and flattened JSON-LD data. | |
| 190 | +| `images` | `string[]`| An array of all absolute image URLs on the page. | |
| 191 | +| `links` | `string[]`| An array of all absolute hyperlink (`<a>`) URLs on the page. | |
| 192 | +| `headers` | `object` | An object containing the final response's HTTP headers. | |
| 193 | +| `language` | `string` | The content language (ISO 639-1) from the `<html>` tag or headers. | |
| 194 | + |
94 | 195 |
|
95 | 196 | ## License
|
96 | 197 |
|
|
0 commit comments