|  | 
| 1 |  | -../README.md | 
|  | 1 | +# scraper | 
|  | 2 | + | 
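|  | 3 | +[![crates.io](https://img.shields.io/crates/v/scraper)][crate] | 
|  | 4 | +[![downloads](https://img.shields.io/crates/d/scraper)][crate] | 
|  | 5 | +[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] | 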
|  | 6 | + | 
|  | 7 | +HTML parsing and querying with CSS selectors. | 
|  | 8 | + | 
|  | 9 | +`scraper` is on [Crates.io][crate] and [GitHub][github]. | 
|  | 10 | + | 
|  | 11 | +[crate]: https://crates.io/crates/scraper | 
|  | 12 | +[github]: https://github.com/causal-agent/scraper | 
|  | 13 | +[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml | 
|  | 14 | + | 
|  | 15 | +Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. | 
|  | 16 | + | 
|  | 17 | +## Examples | 
|  | 18 | + | 
|  | 19 | +### Parsing a document | 
|  | 20 | + | 
|  | 21 | +```rust | 
|  | 22 | +use scraper::Html; | 
|  | 23 | + | 
|  | 24 | +let html = r#" | 
|  | 25 | +    <!DOCTYPE html> | 
|  | 26 | +    <meta charset="utf-8"> | 
|  | 27 | +    <title>Hello, world!</title> | 
|  | 28 | +    <h1 class="foo">Hello, <i>world!</i></h1> | 
|  | 29 | +"#; | 
|  | 30 | + | 
|  | 31 | +let document = Html::parse_document(html); | 
|  | 32 | +``` | 
|  | 33 | + | 
|  | 34 | +### Parsing a fragment | 
|  | 35 | + | 
|  | 36 | +```rust | 
|  | 37 | +use scraper::Html; | 
|  | 38 | +let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>"); | 
|  | 39 | +``` | 
|  | 40 | + | 
|  | 41 | +### Parsing a selector | 
|  | 42 | + | 
|  | 43 | +```rust | 
|  | 44 | +use scraper::Selector; | 
|  | 45 | +let selector = Selector::parse("h1.foo").unwrap(); | 
|  | 46 | +``` | 
|  | 47 | + | 
|  | 48 | +### Selecting elements | 
|  | 49 | + | 
|  | 50 | +```rust | 
|  | 51 | +use scraper::{Html, Selector}; | 
|  | 52 | + | 
|  | 53 | +let html = r#" | 
|  | 54 | +    <ul> | 
|  | 55 | +        <li>Foo</li> | 
|  | 56 | +        <li>Bar</li> | 
|  | 57 | +        <li>Baz</li> | 
|  | 58 | +    </ul> | 
|  | 59 | +"#; | 
|  | 60 | + | 
|  | 61 | +let fragment = Html::parse_fragment(html); | 
|  | 62 | +let selector = Selector::parse("li").unwrap(); | 
|  | 63 | + | 
|  | 64 | +for element in fragment.select(&selector) { | 
|  | 65 | +    assert_eq!("li", element.value().name()); | 
|  | 66 | +} | 
|  | 67 | +``` | 
|  | 68 | + | 
|  | 69 | +### Selecting descendant elements | 
|  | 70 | + | 
|  | 71 | +```rust | 
|  | 72 | +use scraper::{Html, Selector}; | 
|  | 73 | + | 
|  | 74 | +let html = r#" | 
|  | 75 | +    <ul> | 
|  | 76 | +        <li>Foo</li> | 
|  | 77 | +        <li>Bar</li> | 
|  | 78 | +        <li>Baz</li> | 
|  | 79 | +    </ul> | 
|  | 80 | +"#; | 
|  | 81 | + | 
|  | 82 | +let fragment = Html::parse_fragment(html); | 
|  | 83 | +let ul_selector = Selector::parse("ul").unwrap(); | 
|  | 84 | +let li_selector = Selector::parse("li").unwrap(); | 
|  | 85 | + | 
|  | 86 | +let ul = fragment.select(&ul_selector).next().unwrap(); | 
|  | 87 | +for element in ul.select(&li_selector) { | 
|  | 88 | +    assert_eq!("li", element.value().name()); | 
|  | 89 | +} | 
|  | 90 | +``` | 
|  | 91 | + | 
|  | 92 | +### Accessing element attributes | 
|  | 93 | + | 
|  | 94 | +```rust | 
|  | 95 | +use scraper::{Html, Selector}; | 
|  | 96 | + | 
|  | 97 | +let fragment = Html::parse_fragment(r#"<input name="foo" value="bar">"#); | 
|  | 98 | +let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); | 
|  | 99 | + | 
|  | 100 | +let input = fragment.select(&selector).next().unwrap(); | 
|  | 101 | +assert_eq!(Some("bar"), input.value().attr("value")); | 
|  | 102 | +``` | 
|  | 103 | + | 
|  | 104 | +### Serializing HTML and inner HTML | 
|  | 105 | + | 
|  | 106 | +```rust | 
|  | 107 | +use scraper::{Html, Selector}; | 
|  | 108 | + | 
|  | 109 | +let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>"); | 
|  | 110 | +let selector = Selector::parse("h1").unwrap(); | 
|  | 111 | + | 
|  | 112 | +let h1 = fragment.select(&selector).next().unwrap(); | 
|  | 113 | + | 
|  | 114 | +assert_eq!("<h1>Hello, <i>world!</i></h1>", h1.html()); | 
|  | 115 | +assert_eq!("Hello, <i>world!</i>", h1.inner_html()); | 
|  | 116 | +``` | 
|  | 117 | + | 
|  | 118 | +### Accessing descendant text | 
|  | 119 | + | 
|  | 120 | +```rust | 
|  | 121 | +use scraper::{Html, Selector}; | 
|  | 122 | + | 
|  | 123 | +let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>"); | 
|  | 124 | +let selector = Selector::parse("h1").unwrap(); | 
|  | 125 | + | 
|  | 126 | +let h1 = fragment.select(&selector).next().unwrap(); | 
|  | 127 | +let text = h1.text().collect::<Vec<_>>(); | 
|  | 128 | + | 
|  | 129 | +assert_eq!(vec!["Hello, ", "world!"], text); | 
|  | 130 | +``` | 
|  | 131 | + | 
|  | 132 | +### Manipulating the DOM | 
|  | 133 | + | 
|  | 134 | +```rust | 
|  | 135 | +use html5ever::tree_builder::TreeSink; | 
|  | 136 | +use scraper::{Html, Selector}; | 
|  | 137 | + | 
|  | 138 | +let html = "<html><body>hello<p class=\"hello\">REMOVE ME</p></body></html>"; | 
|  | 139 | +let selector = Selector::parse(".hello").unwrap(); | 
|  | 140 | +let mut document = Html::parse_document(html); | 
|  | 141 | +let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); | 
|  | 142 | +for id in node_ids { | 
|  | 143 | +    document.remove_from_parent(&id); | 
|  | 144 | +} | 
|  | 145 | +assert_eq!(document.html(), "<html><head></head><body>hello</body></html>"); | 
|  | 146 | +``` | 
|  | 147 | + | 
|  | 148 | +## Contributing | 
|  | 149 | + | 
|  | 150 | +Please feel free to open pull requests. If you're planning on implementing | 
|  | 151 | +something big (i.e., not a typo fix, a small bug fix, a minor refactor, etc.), | 
|  | 152 | +then please open an issue first. | 
0 commit comments