<?php

/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * The handlers below receive the current token by reference and, when a
 * paragraph is required, overwrite it with an array of replacement tokens
 * (a <p> start token, split text, </p> end tokens and "\n\n" separators).
 * Branches that contain only "State N.N" comments are intentional no-ops:
 * they document input shapes for which the token is left untouched.
 *
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token with the
     * 'MakeWellFormed_TagClosedError' armor flag set to true.
     * NOTE(review): presumably the flag keeps MakeWellFormed from reporting
     * an error when it auto-closes this generated tag -- confirm there.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token. Depending on the current nesting and on whether
     * the text contains a double newline, $token is either left alone or
     * replaced (by reference) with an array of tokens.
     * @param HTMLPurifier_Token_Text $token
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                $i = $nesting = null;
                // $current is created by reference by forwardUntilEndToken()
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    // $current is only inspected when the token is whitespace,
                    // in which case the forward scan above must have succeeded.
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------

            // Start from an empty result: _splitText() treats that as
            // "no <p> start token was injected by us".
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token. When a paragraph has to be opened in front
     * of the element, $token is replaced (by reference) with an array that
     * starts with a <p> start token; in the root node a "\n\n" text token may
     * additionally be prepended.
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----

                        // $token is still a single token when the element
                        // was a block (State 3.2); normalise it to an array
                        // before prepending the separator.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     * @param string $data String text data that will be processed
     *    into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i === 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 === $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags). The check is a lookup of the token's name in the
     * child element list of the <p> definition.
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag. Scanning starts with a nesting depth of 1 when
     * the current token is a start tag and 0 otherwise, and stops at the
     * first token for which _checkNeedsP() gives a definite answer.
     * @return bool False when the scan ends without a definite answer.
     */
    private function _pLookAhead()
    {
        if ($this->currentToken instanceof HTMLPurifier_Token_Start) {
            $nesting = 1;
        } else {
            $nesting = 0;
        }
        $ok = false;
        $i = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                $ok = $result;
                break;
            }
        }
        return $ok;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken
     * @param HTMLPurifier_Token $current
     * @return bool|null True if the token is text containing a double
     *    newline, false if it is a non-inline start tag, null if this
     *    token does not settle the question.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            } else {
                // <div>PAR1<b>PAR1...
                //      ----
            }
        }
        return null;
    }
}

// vim: et sw=4 sts=4