Yaffs site version 1.1
[yaffs-website] / vendor / ezyang / htmlpurifier / library / HTMLPurifier / Injector / AutoParagraph.php
<?php

/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * A double newline ("\n\n") inside a text token is treated as a paragraph
 * boundary. Text and inline elements are wrapped in <p> tokens whenever the
 * current parent allows a <p> child.
 *
 * The handlers below are organised as a state machine; every branch carries
 * a "State x.y" diagram in which the dashes underline the token currently
 * being handled.
 *
 * Note: allowsElement(), forwardUntilEndToken(), backward(), and the
 * $currentNesting / $currentToken / $htmlDefinition properties are not
 * defined in this file; they are presumably inherited from
 * HTMLPurifier_Injector.
 *
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * Name of this injector.
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Element names this injector relies on (only <p>).
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds the start token for an automatically inserted paragraph.
     *
     * The token is armored with 'MakeWellFormed_TagClosedError'; presumably
     * this keeps MakeWellFormed from reporting an error when it later has to
     * close the generated tag itself (confirm against MakeWellFormed).
     *
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, wrapping and/or splitting it into paragraphs
     * where appropriate.
     *
     * @param HTMLPurifier_Token_Text $token Text token, passed by reference.
     *    It is replaced by an array of tokens whenever paragraph tokens are
     *    injected; otherwise it is left untouched.
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the outer else-branch (State 2) if
                // you're in the document.

                // $current receives the next token by reference; it is only
                // inspected below when the forward scan actually found one.
                $i = $nesting = null;
                $current = null;
                $hasNext = $this->forwardUntilEndToken($i, $current, $nesting);

                if (!$hasNext && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } elseif (!$token->is_whitespace || $this->_isInline($current)) {
                    // State 1.2: PAR1
                    //            ----

                    // State 1.3: PAR1\n\nPAR2
                    //            ------------

                    // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                    //                 ------------
                    $token = array($this->_pStart());
                    $this->_splitText($text, $token);
                } else {
                    // State 1.5: \n<hr />
                    //            --
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // The current parent is itself a <p> tag.

            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------

            // Start from an empty result: _splitText() interprets that as
            // "we are already inside a paragraph".
            $token = array();
            $this->_splitText($text, $token);
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token, prepending a paragraph start token (and,
     * in the root node, a separating double newline) where appropriate.
     *
     * @param HTMLPurifier_Token $token Element token, passed by reference.
     *    It is replaced by an array of tokens whenever something is
     *    injected in front of it; otherwise it is left untouched.
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if (!$this->allowsElement('p')) {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
            return;
        }

        if (!empty($this->currentNesting)) {
            if (!$this->_isInline($token)) {
                // State 2.3: ...<div>
                //               -----
                return;
            }

            // State 1: <div>...<b>
            //                  ---
            // Check if this token is adjacent to the parent token
            // (seek backwards until token isn't whitespace)
            $i = null;
            $prev = null;
            $this->backward($i, $prev);

            if ($prev instanceof HTMLPurifier_Token_Start) {
                // State 1.2.1: <div><b>
                //                   ---
                // Lookahead to see if <p> is needed.
                if ($this->_pLookAhead()) {
                    // State 1.3.1: <div><b>PAR1\n\nPAR2
                    //                   ---
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 1.3.2: <div><b>PAR1</b></div>
                    //                   ---

                    // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                    //                   ---
                }
                return;
            }

            // Token wasn't adjacent
            if ($prev instanceof HTMLPurifier_Token_Text &&
                substr($prev->data, -2) === "\n\n"
            ) {
                // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                //                                  ---
                // Quite frankly, this should be handled by splitText
                $token = array($this->_pStart(), $token);
            } else {
                // State 1.1.1: <div><p>PAR1</p><b>
                //                              ---
                // State 1.1.2: <div><br /><b>
                //                         ---
                // State 1.1.3: <div>PAR<b>
                //                      ---
            }
            return;
        }

        // From here on we are in the anonymous root node.
        if ($this->_isInline($token)) {
            // State 3.1: <b>
            //            ---
            // This is where the {p} tag is inserted, not reflected in
            // inputTokens yet, however.
            $token = array($this->_pStart(), $token);
        } else {
            // State 3.2: <div>
            //            -----
        }

        $i = null;
        $prev = null;
        if ($this->backward($i, $prev)) {
            if (!($prev instanceof HTMLPurifier_Token_Text)) {
                // State 3.1.1: ...</p>{p}<b>
                //                        ---
                // State 3.2.1: ...</p><div>
                //                     -----
                // Separate this element from the previous one with a
                // double newline, in front of any injected {p}.
                if (!is_array($token)) {
                    $token = array($token);
                }
                array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
            } else {
                // State 3.1.2: ...</p>\n\n{p}<b>
                //                            ---
                // State 3.2.2: ...</p>\n\n<div>
                //                         -----
                // Note: PAR<ELEM> cannot occur because PAR would have been
                // wrapped in <p> tags.
            }
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original.
     *
     * An empty $result on entry is taken to mean that the caller did not
     * open a paragraph itself, i.e. we are already inside one.
     *
     * @param string $data String text data that will be processed
     *    into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
                continue;
            }
            // Whitespace-only chunk: only its position (front / end) matters.
            if ($i === 0) {
                // Double newline at the front
                if (empty($result)) {
                    // The empty result indicates that the AutoParagraph
                    // injector did not add any start paragraph tokens.
                    // This means that we have been in a paragraph for
                    // a while, and the newline means we should start a new one.
                    $result[] = new HTMLPurifier_Token_End('p');
                    $result[] = new HTMLPurifier_Token_Text("\n\n");
                    // However, the start token should only be added if
                    // there is more processing to be done (i.e. there are
                    // real paragraphs in here). If there are none, the
                    // next start paragraph tag will be handled by the
                    // next call to the injector
                    $needs_start = true;
                } else {
                    // We just started a new paragraph!
                    // Reinstate a double-newline for presentation's sake, since
                    // it was in the source code.
                    array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                }
            } elseif ($i + 1 === $c) {
                // Double newline at the end
                // There should be a trailing </p> when we're finally done.
                $needs_end = true;
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result; each one is emitted as
        // text, </p>, "\n\n", <p>.
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags), i.e. its name is listed among the child elements
     * of <p> in the HTML definition.
     *
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     *
     * Scans forward with forwardUntilEndToken() and stops at the first
     * token for which _checkNeedsP() gives a definite answer.
     *
     * @return bool True if a paragraph is needed; false if a block element
     *    was found first or the scan ended without a verdict.
     */
    private function _pLookAhead()
    {
        // When the current token is itself a start tag, begin the scan one
        // nesting level deep.
        $nesting = ($this->currentToken instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        $i = null;
        $current = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                return $result;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken.
     *
     * @param HTMLPurifier_Token $current
     * @return bool|null True if a paragraph is needed, false if definitely
     *    not, null if this token does not settle the question.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            }
            // Otherwise:
            // <div>PAR1<b>PAR1...
            //      ----
        }
        return null;
    }
}

// vim: et sw=4 sts=4