123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448 |
- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
- <html>
- <head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>PHP Simple HTML DOM Parser: Manual</title>
- <link href="css/default.css" rel="stylesheet" type="text/css">
- <link rel="stylesheet" href="css/ui.tabs.css" type="text/css" media="print, projection, screen">
- <script type="text/javascript" src="js/jquery-1.2.3.pack.js"></script>
- <script type="text/javascript" src="js/ui.tabs.pack.js"></script>
- <script language="JavaScript" type="text/JavaScript">
- $(document).ready(function(){
- $(function() {$('#container_quickstart > ul').tabs();});
- $(function() {$('#container_create > ul').tabs();});
- $(function() {$('#container_find > ul').tabs();});
- $(function() {$('#container_access > ul').tabs();});
- $(function() {$('#container_traverse > ul').tabs();});
- $(function() {$('#container_dump > ul').tabs();});
- $(function() {$('#container_callback > ul').tabs();});
- });
- </script>
- </head>
- <body>
- <h1><a name="top"></a>PHP Simple HTML DOM Parser Manual</h1>
- <div id="content">
- <h2>Index</h2>
- <ul>
- <li><a href="#section_quickstart">Quick Start</a></li>
- <li><a href="#section_create">How to create HTML DOM object?</a></li>
- <li><a href="#section_find">How to find HTML elements?</a></li>
- <li><a href="#section_access">How to access the HTML element's attributes?</a> </li>
- <li><a href="#section_traverse">How to traverse the DOM tree?</a></li>
- <li><a href="#section_dump">How to dump contents of DOM object?</a></li>
- <li><a href="#section_callback">How to customize the parsing behavior?</a></li>
- <li><a href="manual_api.htm">API Reference</a></li>
- <li><a href="manual_faq.htm">FAQ</a></li>
- </ul>
-
- <a name="section_quickstart"></a>
- <h2>Quick Start</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_quickstart">
- <ul>
- <li><a href="#fragment-11"><span>Get HTML elements</span></a></li>
- <li><a href="#fragment-12"><span>Modify HTML elements</span></a></li>
- <li><a href="#fragment-13"><span>Extract contents from HTML</span></a></li>
- <li><a href="#fragment-14"><span>Scraping Slashdot!</span></a></li>
- </ul>
- <div id="fragment-11">
- <div class="code">
- <span class="comment">// Create DOM from URL or file</span><br>
- $html = <strong>file_get_html</strong>(<span class="var">'http://www.google.com/'</span>);<br>
- <br>
- <span class="comment">// Find all images </span><br>
- foreach($html-><strong>find</strong>(<span class="var">'img'</span>) as $element) <br>
- echo $element-><strong>src</strong> . <span class="var">'<br>'</span>;<br>
- <br>
- <span class="comment">// Find all links </span><br>
- foreach($html-><strong>find</strong>(<span class="var">'a'</span>) as $element) <br>
- echo $element-><strong>href</strong> . <span class="var">'<br>'</span>; </div>
- </div>
- <div id="fragment-12">
- <div class="code">
- <span class="comment">// Create DOM from string</span><br>
- $html = <strong>str_get_html</strong>(<span class="var">'<div id="hello">Hello</div><div id="world">World</div>'</span>);<span class="comment"><br>
- <br>
- </span>
-
- $html-><strong>find</strong>(<span class="var">'div', 1</span>)-><strong>class</strong> = <span class="var">'bar'</span>;<br>
- <br>
- $html-><strong>find</strong>(<span class="var">'div[id=hello]', 0</span>)-><strong>innertext</strong> = <span class="var">'foo'</span>;<br>
- <br>
- echo $html; <span class="comment">// Output: <div id="hello"><strong>foo</strong></div><div id="world" <strong>class="bar"</strong>>World</div></span> </div>
- </div>
- <div id="fragment-13">
- <div class="code"><br>
- <span class="comment">// Dump contents (without tags) from HTML</span><br>
- echo <strong>file_get_html</strong>(<span class="var">'http://www.google.com/'</span>)-><strong>plaintext</strong>;
- <br>
- <br>
- </div>
- </div>
- <div id="fragment-14">
- <div class="code">
- <span class="comment">// Create DOM from URL</span><br>
- $html = <strong>file_get_html</strong>(<span class="var">'http://slashdot.org/'</span>);<br>
- <br>
- <span class="comment">// Find all article blocks</span><br>
- foreach($html-><strong>find</strong>(<span class="var">'div.article'</span>) as $article) {<br>
- $item[<span class="var">'title'</span>] = $article-><strong>find</strong>(<span class="var">'div.title'</span>, <span class="var">0</span>)-><strong>plaintext</strong>;<br>
- $item[<span class="var">'intro'</span>] = $article-><strong>find</strong>(<span class="var">'div.intro'</span>, <span class="var">0</span>)-><strong>plaintext</strong>;<br>
- $item[<span class="var">'details'</span>] = $article-><strong>find</strong>(<span class="var">'div.details'</span>, <span class="var">0</span>)-><strong>plaintext</strong>;<br>
- $articles[] = $item;<br>
- }<br>
- <br>
- print_r($articles);
- </div>
- </div>
- </div>
-
- <a name="section_create"></a>
- <h2>How to create HTML DOM object?</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_create">
- <ul>
- <li><a href="#frag_create_quick"><span>Quick way</span></a></li>
- <li><a href="#frag_create_oo"><span>Object-oriented way</span></a></li>
- </ul>
- <div id="frag_create_quick">
- <div class="code"><span class="comment">// Create a DOM object from a string</span><br>
- $html = <strong>str_get_html</strong>(<span class="var">'<html><body>Hello!</body></html>'</span>);<br>
- <br>
- <span class="comment">// Create a DOM object from a URL</span><br>
- $html = <strong>file_get_html</strong>(<span class="var">'http://www.google.com/'</span>);<br>
- <br>
- <span class="comment">// Create a DOM object from an HTML file</span><br>
- $html = <strong>file_get_html</strong>(<span class="var">'test.htm'</span>);<span class="comment"><br>
- </span></div>
- </div>
- <div id="frag_create_oo">
- <div class="code"><span class="comment">// Create a DOM object</span><br>
- $html = new <strong>simple_html_dom</strong>();<br>
- <br>
- <span class="comment">// Load HTML from a string</span><br>
- $html-><strong>load</strong>(<span class="var">'<html><body>Hello!</body></html>'</span>);<br>
- <br>
- <span class="comment">// Load HTML from a URL </span> <br>
- $html-><strong>load_file</strong>(<span class="var">'http://www.google.com/'</span>);<br>
- <br>
- <span class="comment">// Load HTML from an HTML file</span> <br>
- $html-><strong>load_file</strong>(<span class="var">'test.htm'</span>);</div>
- </div>
- </div>
- <a name="section_find"></a>
- <h2>How to find HTML elements?</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_find">
- <ul>
- <li><a href="#frag_find_basic"><span>Basics</span></a></li>
- <li><a href="#frag_find_advanced"><span>Advanced</span></a></li>
- <li><a href="#frag_find_chain"><span>Descendant selectors</span></a></li>
- <li><a href="#frag_find_nested"><span>Nested selectors</span></a></li>
- <li><a href="#frag_find_attr"><span>Attribute Filters</span></a></li>
- <li><a href="#frag_find_textcomment"><span>Text & Comments</span></a></li>
- </ul>
- <div id="frag_find_basic">
- <div class="code"> <span class="comment">// Find all <strong>anchors</strong>, returns an <strong>array</strong> of element objects</span><br>
- $ret = $html->find(<span class="var">'<strong>a</strong>'</span>);<br>
- <br>
- <span class="comment">// Find <strong>(N)th</strong> <strong>anchor</strong>, returns element object or <strong>null</strong> if not found</span> <span class="comment">(zero based)</span><br>
- $ret = $html->find(<span class="var">'<strong>a</strong>', <strong>0</strong></span>);<br>
- <br>
- <span class="comment">// Find <strong>last</strong> <strong>anchor</strong>, returns element object or <strong>null</strong> if not found</span> <span class="comment">(zero based)</span><br>
- $ret = $html->find(<span class="var">'<strong>a</strong>', <strong>-1</strong></span>); <br>
- <br>
- <span class="comment">// Find all <strong><div></strong> with the <strong>id</strong> attribute</span><br>
- $ret = $html->find(<span class="var">'<strong>div[id]</strong>'</span>);<br>
- <br>
- <span class="comment">// Find all <strong><div></strong> whose attribute <strong>id=foo</strong></span><br>
- $ret = $html->find(<span class="var">'<strong>div[id=foo]</strong>'</span>); <br>
- </div>
- </div>
- <div id="frag_find_advanced">
- <div class="code"><span class="comment">// Find all elements whose <strong>id</strong>=foo</span><br>
- $ret = $html->find(<span class="var">'<strong>#foo</strong>'</span>);<br>
- <br>
- <span class="comment">// Find all elements whose <strong>class</strong>=foo</span><br>
- $ret = $html->find(<span class="var">'<strong>.foo</strong>'</span>);<br>
- <br>
- <span class="comment">// Find all elements that have the attribute <strong>id</strong></span><br>
- $ret = $html->find(<span class="var">'<strong>*[id]</strong>'</span>); <br>
- <br>
- <span class="comment">// Find all <strong>anchors</strong> and <strong>images</strong> </span><br>
- $ret = $html->find(<span class="var">'<strong>a, img</strong>'</span>); <br>
- <br>
- <span class="comment">// Find all <strong>anchors</strong> and <strong>images</strong> with the "title" attribute</span><br>
- $ret = $html->find(<span class="var">'<strong>a[title], img[title]</strong>'</span>);<br>
- </div>
- </div>
- <div id="frag_find_attr">
- <div class="code">
- Supports these operators in attribute selectors:<br><br>
- <table cellpadding="1" cellspacing="1">
- <tr>
- <th width="25%">Filter</th>
- <th width="75%">Description</th>
- </tr>
- <tr>
- <td>[attribute]</td>
- <td>Matches elements that <strong>have</strong> the specified attribute.</td>
- </tr>
- <tr>
- <td>[!attribute]</td>
- <td>Matches elements that <strong>don't have</strong> the specified attribute.</td>
- </tr>
- <tr>
- <td>[attribute=value]</td>
- <td>Matches elements that have the specified attribute with a <strong>certain value</strong>.</td>
- </tr>
- <tr>
- <td>[attribute!=value]</td>
- <td>Matches elements that <strong>don't have</strong> the specified attribute with a certain value.</td>
- </tr>
- <tr>
- <td>[attribute^=value]</td>
- <td>Matches elements that have the specified attribute and it <strong>starts</strong> with a certain value.</td>
- </tr>
- <tr>
- <td>[attribute$=value]</td>
- <td>Matches elements that have the specified attribute and it <strong>ends</strong> with a certain value.</td>
- </tr>
- <tr>
- <td>[attribute*=value]</td>
- <td>Matches elements that have the specified attribute and it <strong>contains</strong> a certain value.</td>
- </tr>
- </table>
- </div>
- </div>
- <div id="frag_find_chain">
- <div class="code"><span class="comment">// Find all <li> in <ul> </span><br>
- $es = $html->find(<span class="var">'<strong>ul li</strong>'</span>);<br>
- <br>
- <span class="comment">// Find Nested <div> </span><span class="comment">tags</span><br>
- $es = $html->find(<span class="var">'<strong>div div div</strong>'</span>); <br>
- <br>
- <span class="comment">// Find all <td> in <table> whose class=hello </span><br>
- $es = $html->find(<span class="var">'<strong>table.hello td</strong>'</span>);<br>
- <br>
- <span class="comment">// Find all td tags with attribute align=center in table tags </span><br>
- $es = $html->find(<span class="var">'<strong>table</strong><strong> td[align=center]</strong>'</span>);<br>
- </div>
- </div>
- <div id="frag_find_textcomment">
- <div class="code"><span class="comment"> // Find all text blocks </span><br>
- $es = $html->find(<span class="var">'<strong>text</strong>'</span>);<br>
- <br>
- <span class="comment">// Find all comment (<!--...-->) blocks </span><br>
- $es = $html->find(<span class="var">'<strong>comment</strong>'</span>);<br>
- </div>
- </div>
- <div id="frag_find_nested">
- <div class="code"> <span class="comment">// Find all <li> in <ul> </span><br>
- foreach($html->find(<span class="var">'<strong>ul</strong>'</span>) as $ul) <br>
- {<br>
- foreach($ul->find(<span class="var">'<strong>li</strong>'</span>) as $li) <br>
- {<br>
- <span class="comment">// do something...</span><br>
- }<br>
- }<br>
- <br>
- <span class="comment">// Find first <li> in first <ul></span> <br>
- $e = $html->find(<span class="var">'<strong>ul</strong>', <strong>0</strong></span>)->find(<span class="var">'<strong>li</strong>', <strong>0</strong></span>);<br>
- </div>
- </div>
- </div>
- <a name="section_access"></a>
- <h2>How to access the HTML element's attributes?</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_access">
- <ul>
- <li><a href="#frag_access_attr"><span>Get, Set and Remove attributes</span></a></li>
- <li><a href="#frag_access_special"><span>Magic attributes</span></a></li>
- <li><a href="#frag_access_tips"><span>Tips</span></a></li>
- </ul>
- <div id="frag_access_attr">
- <div class="code">
- <span class="comment">// <strong>Get</strong> an attribute (if the attribute is a <strong>non-value</strong> attribute (e.g. checked, selected...), it will return <strong>true</strong> or <strong>false</strong>)</span><br>
- $value = $e-><strong>href</strong>;<br>
- <br>
- <span class="comment">// <strong>Set</strong> an attribute (if the attribute is a <strong>non-value</strong> attribute (e.g. checked, selected...), set its value to <strong>true</strong> or <strong>false</strong>)</span><br>
- $e-><strong>href</strong> = <span class="var">'my link'</span>;<br>
- <br>
- <span class="comment">// <strong>Remove</strong> an attribute by setting its value to null </span><br>
- $e-><strong>href</strong> = <strong><span class="var">null</span></strong>;<br>
- <br>
- <span class="comment">// <strong>Determine</strong> whether an attribute exists</span> <br>
- if(isset($e-><strong>href</strong>)) <br>
- echo <span class="var">'href exist!'</span>;<br>
- </div>
- </div>
- <div id="frag_access_special">
- <div class="code"> <span class="comment">// Example</span><br>
- <span class="hl-var">$</span><span class="hl-code">html = </span>str_get_html<span class="hl-brackets">(</span><span class="var">"<div>foo <b>bar</b></div>"</span><span class="hl-brackets">)</span><span class="hl-code">;</span> <br>
- $e = $html->find(<span class="var">"div"</span>, <span class="var">0</span>);<br>
- <br>
- echo $e-><strong>tag</strong>; <span class="comment">// Returns: " <strong>div</strong>"</span><br>
- echo $e-><strong>outertext</strong>; <span class="comment">// Returns: " <strong><div>foo <b>bar</b></div></strong>"</span><br>
- echo $e-><strong>innertext</strong>; <span class="comment">// Returns: " <strong>foo <b>bar</b></strong>"</span><br>
- echo $e-><strong>plaintext</strong>; <span class="comment">// Returns: " <strong>foo </strong><strong>bar</strong>"<br>
- <br>
- </span>
- <table cellspacing="1" cellpadding="1">
- <tr bgcolor="#CCCCCC">
- <th width="25%">Attribute Name</th>
- <th width="75%">Usage</th>
- </tr>
- <tr>
- <td>$e-><strong>tag</strong></td>
- <td>Read or write the <strong>tag name</strong> of element.</td>
- </tr>
- <tr>
- <td>$e-><strong>outertext</strong></td>
- <td>Read or write the <strong>outer HTML text </strong> of element.</td>
- </tr>
- <tr>
- <td>$e-><strong>innertext</strong></td>
- <td>Read or write the <strong>inner HTML text </strong> of element.</td>
- </tr>
- <tr>
- <td>$e-><strong>plaintext</strong></td>
- <td>Read or write the <strong>plain text </strong> of element.</td>
- </tr>
- </table>
- </div>
- </div>
- <div id="frag_access_tips">
- <div class="code"><span class="comment">// <strong>Extract</strong> contents from HTML </span><br>
- echo <strong>$html</strong>-><strong>plaintext</strong>;<br>
- <br>
- <span class="comment">
- // <strong>Wrap</strong> an element</span><br>
- $e-><strong>outertext</strong> = <span class="var">'<div class="wrap">'</span> . $e-><strong>outertext</strong> . <span class="var">'</div>'</span>;<br>
- <br>
- <span class="comment">// <strong>Remove</strong> an element by setting its outertext to an empty string </span><br>
- $e-><strong>outertext</strong> = <span class="var">''</span>;<br>
- <br>
- <span class="comment">// <strong>Append</strong> an element</span><br>
- $e-><strong>outertext</strong> = $e-><strong>outertext</strong> . <span class="var">'<div>foo</div>'</span>;<br>
- <br>
- <span class="comment">// <strong>Insert</strong> an element</span><br>
- $e-><strong>outertext</strong> = <span class="var">'<div>foo</div>'</span> . $e-><strong>outertext</strong>;<br>
- </div>
- </div>
- </div>
- <a name="section_traverse"></a>
- <h2>How to traverse the DOM tree?</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_traverse">
- <ul>
- <li><a href="#frag_traverse_background"><span>Background Knowledge</span></a></li>
- <li><a href="#frag_traverse_traverse"><span>Traverse the DOM tree</span></a></li>
- </ul>
- <div id="frag_traverse_background">
- <div class="code"> <span class="comment">// If you are not so familiar with HTML DOM, check this <a href="http://php.net/manual/en/book.dom.php" target="_blank"><span class="var">link</span></a> to learn more... </span><br>
- <br>
- <span class="comment">// Example</span><br>
- echo $html-><strong>find</strong>(<span class="var">"#div1", 0</span>)-><strong>children</strong>(<span class="var">1</span>)-><strong>children</strong>(<span class="var">1</span>)-><strong>children</strong>(<span class="var">2</span>)-><span class="var">id</span>;<br>
- <span class="comment">// or</span> <br>
- echo $html-><strong>getElementById</strong>(<span class="var">"div1"</span>)-><strong>childNodes</strong>(<span class="var">1</span>)-><strong>childNodes</strong>(<span class="var">1</span>)-><strong>childNodes</strong>(<span class="var">2</span>)-><strong>getAttribute</strong>(<span class="var">'id'</span>); </div>
- </div>
- <div id="frag_traverse_traverse">
- <div class="code">You can also call methods with <a href="manual_api.htm#camel"><span class="var">camel-case naming conventions</span></a>.<br>
- <table cellspacing="1" cellpadding="1">
- <tr>
- <th> Method </th>
- <th> Description</th>
- </tr>
- <tr>
- <td>
- <div class="returns">mixed</div>$e-><strong>children</strong> ( <span class="var">[int $index]</span> ) </td>
- <td>Returns the Nth <strong>child object</strong> if <strong>index</strong> is set, otherwise returns an <strong>array of children</strong>. </td>
- </tr>
- <tr>
- <td>
- <div class="returns">element</div>$e-><strong>parent</strong> () </td>
- <td>Returns the <strong>parent</strong> of element. </td>
- </tr>
- <tr>
- <td>
- <div class="returns">element</div>$e-><strong>first_child</strong> () </td>
- <td>Returns the <strong>first child</strong> of element, or <strong>null</strong> if not found. </td>
- </tr>
- <tr>
- <td>
- <div class="returns">element</div>$e-><strong>last_child</strong> () </td>
- <td>Returns the <strong>last child</strong> of element, or <strong>null</strong> if not found. </td>
- </tr>
- <tr>
- <td>
- <div class="returns">element</div>$e-><strong>next_sibling</strong> () </td>
- <td>Returns the <strong>next sibling</strong> of element, or<strong> null</strong> if not found. </td>
- </tr>
- <tr>
- <td>
- <div class="returns">element</div>$e-><strong>prev_sibling</strong> () </td>
- <td>Returns the <strong>previous sibling</strong> of element, or <strong>null</strong> if not found. </td>
- </tr>
- </table>
- </div>
- </div>
-
- </div>
- <a name="section_dump"></a>
- <h2>How to dump contents of DOM object?</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_dump">
- <ul>
- <li><a href="#frag_dump_quick"><span>Quick way</span></a></li>
- <li><a href="#frag_dump_oo"><span>Object-oriented way</span></a></li>
- </ul>
- <div id="frag_dump_oo">
- <div class="code"><span class="comment">// </span><span class="comment">Dumps the internal DOM tree back into string </span><br>
- $str = $html-><strong>save</strong>();<br>
- <br>
- <span class="comment">// Dumps the internal DOM tree back into a file</span> <br>
- $html-><strong>save</strong>(<span class="var">'result.htm'</span>);</div>
- </div>
- <div id="frag_dump_quick">
- <div class="code"><span class="comment">// </span><span class="comment">Dumps the internal DOM tree back into string </span><br>
- $str = $html;<br>
- <br>
- <span class="comment">// Print it!</span><br>
- echo $html; <br>
- </div>
- </div>
- </div>
- <a name="section_callback"></a>
- <h2>How to customize the parsing behavior?</h2>
- <a class="top" href="#top">Top</a>
- <div id="container_callback">
- <ul>
- <li><a href="#frag_callback"><span>Callback function</span></a></li>
- </ul>
- <div id="frag_callback">
- <div class="code"><span class="comment">// Write a function with parameter "<strong>$element</strong>"</span><br>
- function my_callback(<span class="var">$element</span>) {<br>
- <span class="comment">// Hide all <b> tags </span><br>
- if ($element->tag==<span class="var">'b'</span>)<br>
- $element->outertext = '';<br>
- } <br>
- <br>
- <span class="comment">// Register the callback function with its <strong>function name</strong></span><br>
- $html-><strong>set_callback</strong>(<span class="var">'my_callback'</span>);<br>
- <br>
- <span class="comment">// Callback function will be invoked while dumping</span><br>
- echo $html;
- </div>
- </div>
- </div>
- <div><br>
- Author: S.C. Chen (me578022@gmail.com)<br>
- Original idea is from Jose Solorzano's <a href="http://php-html.sourceforge.net/">HTML Parser for PHP 4</a>. <br>
- Contributions by: Yousuke Kumakura, Vadim Voituk, Antcs<br>
- </div>
- </div>
- </body>
- </html>
- <!--$Rev: 165 $-->
|