scratch – Blame information for rev
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
87 | office | 1 | <?php |
2 | |||
3 | /* |
||
4 | * This file is part of the Symfony package. |
||
5 | * |
||
6 | * (c) Fabien Potencier <fabien@symfony.com> |
||
7 | * |
||
8 | * For the full copyright and license information, please view the LICENSE |
||
9 | * file that was distributed with this source code. |
||
10 | */ |
||
11 | |||
12 | namespace Symfony\Component\DomCrawler; |
||
13 | |||
14 | use Symfony\Component\CssSelector\CssSelectorConverter; |
||
15 | |||
16 | /** |
||
17 | * Crawler eases navigation of a list of \DOMNode objects. |
||
18 | * |
||
19 | * @author Fabien Potencier <fabien@symfony.com> |
||
20 | */ |
||
21 | class Crawler extends \SplObjectStorage |
||
22 | { |
||
/**
 * The current URI, as handed to the constructor.
 *
 * @var string
 */
protected $uri;

/**
 * The default namespace prefix to be used with XPath and CSS expressions.
 *
 * @var string
 */
private $defaultNamespacePrefix = 'default';

/**
 * A map of manually registered namespaces, keyed by prefix.
 *
 * @var array
 */
private $namespaces = array();

/**
 * The base href value.
 *
 * @var string
 */
private $baseHref;

/**
 * The document owning the first node added to the crawler, or null while
 * the crawler is empty.
 *
 * @var \DOMDocument|null
 */
private $document;

/**
 * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
 *
 * @var bool
 */
private $isHtml = true;
||
54 | |||
/**
 * Stores the URI information and seeds the crawler with the given node(s).
 *
 * @param mixed  $node       A Node to use as the base for the crawling
 * @param string $currentUri The current URI
 * @param string $baseHref   The base href value; when empty, $currentUri is used instead
 */
public function __construct($node = null, $currentUri = null, $baseHref = null)
{
    $this->uri = $currentUri;

    if ($baseHref) {
        $this->baseHref = $baseHref;
    } else {
        $this->baseHref = $currentUri;
    }

    $this->add($node);
}
||
69 | |||
/**
 * Empties the crawler: forgets the tracked document and removes every node.
 */
public function clear()
{
    $this->document = null;

    // Call the parent directly to bypass the deprecated removeAll() override.
    parent::removeAll($this);
}
||
78 | |||
/**
 * Adds a node (or a collection of nodes, or markup) to the current list.
 *
 * Dispatches to the specialized add*() method matching the argument type;
 * null is accepted and ignored.
 *
 * @param \DOMNodeList|\DOMNode|array|string|null $node A node
 *
 * @throws \InvalidArgumentException When node is not the expected type.
 */
public function add($node)
{
    if (null === $node) {
        return;
    }

    if ($node instanceof \DOMNodeList) {
        $this->addNodeList($node);

        return;
    }

    if ($node instanceof \DOMNode) {
        $this->addNode($node);

        return;
    }

    if (is_array($node)) {
        $this->addNodes($node);

        return;
    }

    if (is_string($node)) {
        $this->addContent($node);

        return;
    }

    $actualType = is_object($node) ? get_class($node) : gettype($node);

    throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
}
||
103 | |||
/**
 * Adds HTML/XML content.
 *
 * When no content type is given, content starting with "<?xml" is treated
 * as "application/xml" and everything else as "text/html". Content whose
 * type mentions neither "xml" nor "html" is ignored.
 *
 * The charset is taken from the "charset=" parameter of the content type,
 * then from a <meta charset> declaration in the content. If neither is
 * present, it is assumed to be ISO-8859-1, which is the default charset
 * defined by the HTTP 1.1 specification.
 *
 * @param string      $content A string to parse as HTML/XML
 * @param null|string $type    The content type of the string
 */
public function addContent($content, $type = null)
{
    if (empty($type)) {
        $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // DOM only for HTML/XML content
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    $charset = null;
    if (false !== $pos = stripos($type, 'charset=')) {
        $charset = substr($type, $pos + 8);
        if (false !== $pos = strpos($charset, ';')) {
            $charset = substr($charset, 0, $pos);
        }

        // The parameter value may be quoted or padded (charset="utf-8").
        $charset = trim($charset, " \t\"'");

        // An empty parameter carries no information: fall back to detection below.
        if ('' === $charset) {
            $charset = null;
        }
    }

    // http://www.w3.org/TR/encoding/#encodings
    // http://www.w3.org/TR/REC-xml/#NT-EncName
    if (null === $charset &&
        preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
        $charset = $matches[1];
    }

    if (null === $charset) {
        $charset = 'ISO-8859-1';
    }

    // The pattern above matches case-insensitively, so normalize the captured
    // group before comparing; otherwise "application/XML" is parsed as HTML.
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
||
150 | |||
/**
 * Adds an HTML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * After parsing, the href of a <base> element found in the content (if any)
 * becomes the crawler's base href, resolved against the previous base href
 * when one was set.
 *
 * @param string $content The HTML content
 * @param string $charset The charset
 */
public function addHtmlContent($content, $charset = 'UTF-8')
{
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = libxml_disable_entity_loader(true);

    $dom = new \DOMDocument('1.0', $charset);
    $dom->validateOnParse = true;

    // Turn conversion warnings (e.g. unknown charset) into exceptions so a
    // failed conversion simply leaves $content untouched.
    set_error_handler(function () { throw new \Exception(); });

    try {
        // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
        $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
    } catch (\Exception $e) {
    } catch (\ValueError $e) {
        // PHP 8 throws ValueError (not an Exception) for an unknown encoding.
        // It must be swallowed too, otherwise the temporary error handler and
        // the libxml settings below would never be restored.
    }

    restore_error_handler();

    if ('' !== trim($content)) {
        @$dom->loadHTML($content);
    }

    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntities);

    $this->addDocument($dom);

    $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

    $baseHref = current($base);
    if (count($base) && !empty($baseHref)) {
        if ($this->baseHref) {
            // Resolve the <base> href against the existing base through a throw-away link.
            $linkNode = $dom->createElement('a');
            $linkNode->setAttribute('href', $baseHref);
            $link = new Link($linkNode, $this->baseHref);
            $this->baseHref = $link->getUri();
        } else {
            $this->baseHref = $baseHref;
        }
    }
}
||
205 | |||
/**
 * Adds an XML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * Calling this method marks the crawler as holding XML rather than HTML.
 *
 * @param string $content The XML content
 * @param string $charset The charset
 * @param int    $options Bitwise OR of the libxml option constants
 *                        LIBXML_PARSEHUGE is dangerous, see
 *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
 */
public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
{
    // remove the default namespace if it's the only namespace to make XPath expressions simpler
    $hasPrefixedNamespace = preg_match('/xmlns:/', $content);
    if (!$hasPrefixedNamespace) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    if ('' !== trim($content)) {
        @$document->loadXML($content, $options);
    }

    // Put the libxml settings back the way the caller had them.
    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $this->isHtml = false;
}
||
246 | |||
/**
 * Adds the root element of a \DOMDocument to the list of nodes.
 *
 * A document without a root element is ignored.
 *
 * @param \DOMDocument $dom A \DOMDocument instance
 */
public function addDocument(\DOMDocument $dom)
{
    if (!$dom->documentElement) {
        return;
    }

    $this->addNode($dom->documentElement);
}
||
258 | |||
/**
 * Adds every \DOMNode of a \DOMNodeList to the list of nodes.
 *
 * @param \DOMNodeList $nodes A \DOMNodeList instance
 */
public function addNodeList(\DOMNodeList $nodes)
{
    foreach ($nodes as $item) {
        if (!$item instanceof \DOMNode) {
            continue;
        }

        $this->addNode($item);
    }
}
||
272 | |||
/**
 * Adds an array of \DOMNode instances to the list of nodes.
 *
 * Each entry goes through add(), so the type dispatch of add() applies to it.
 *
 * @param \DOMNode[] $nodes An array of \DOMNode instances
 */
public function addNodes(array $nodes)
{
    foreach ($nodes as $entry) {
        $this->add($entry);
    }
}
||
284 | |||
/**
 * Adds a \DOMNode instance to the list of nodes.
 *
 * A \DOMDocument is replaced by its root element; a document without a
 * root element is ignored (the same rule addDocument() applies).
 *
 * The document owning the first attached node is remembered. Attaching a
 * node owned by a different document triggers a deprecation notice.
 *
 * @param \DOMNode $node A \DOMNode instance
 */
public function addNode(\DOMNode $node)
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        // An empty document has no root element: there is nothing to attach,
        // and dereferencing null below would fail.
        if (null === $node) {
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        @trigger_error('Attaching DOM nodes from multiple documents in a Crawler is deprecated as of 2.8 and will be forbidden in 3.0.', E_USER_DEPRECATED);
    }

    if (null === $this->document) {
        $this->document = $node->ownerDocument;
    }

    // Call the parent directly to bypass the deprecated attach() override.
    parent::attach($node);
}
||
306 | |||
/**
 * Always fails: serializing and unserializing a crawler creates DOM objects
 * in a corrupted state, because DOM elements are not properly serializable.
 *
 * @param string $serialized Ignored
 *
 * @throws \BadMethodCallException Always
 */
public function unserialize($serialized)
{
    $message = 'A Crawler cannot be serialized.';

    throw new \BadMethodCallException($message);
}
||
312 | |||
/**
 * Always fails: a crawler holds DOM nodes and refuses to be serialized.
 *
 * @throws \BadMethodCallException Always
 */
public function serialize()
{
    $message = 'A Crawler cannot be serialized.';

    throw new \BadMethodCallException($message);
}
||
317 | |||
/**
 * Returns a crawler wrapping the node found at the given position.
 *
 * When no node sits at that position, the returned crawler is created from null.
 *
 * @param int $position The position
 *
 * @return self
 */
public function eq($position)
{
    $found = null;

    foreach ($this as $index => $candidate) {
        // Loose comparison on purpose: numeric strings are accepted as positions.
        if ($index == $position) {
            $found = $candidate;
            break;
        }
    }

    return $this->createSubCrawler($found);
}
||
335 | |||
/**
 * Calls an anonymous function on each node of the list and collects the results.
 *
 * The anonymous function receives the node wrapped in a Crawler instance
 * as first argument and its position as second argument.
 *
 * Example:
 *
 *     $crawler->filter('h1')->each(function ($node, $i) {
 *         return $node->text();
 *     });
 *
 * @param \Closure $closure An anonymous function
 *
 * @return array An array of values returned by the anonymous function
 */
public function each(\Closure $closure)
{
    $results = array();

    foreach ($this as $position => $domNode) {
        $wrapped = $this->createSubCrawler($domNode);
        $results[] = $closure($wrapped, $position);
    }

    return $results;
}
||
361 | |||
/**
 * Slices the list of nodes by $offset and $length.
 *
 * The window is taken with a \LimitIterator over the crawler.
 *
 * @param int $offset
 * @param int $length
 *
 * @return self
 */
public function slice($offset = 0, $length = -1)
{
    $window = new \LimitIterator($this, $offset, $length);
    $selected = iterator_to_array($window);

    return $this->createSubCrawler($selected);
}
||
374 | |||
/**
 * Reduces the list of nodes by calling an anonymous function.
 *
 * A node is dropped only when the anonymous function returns exactly false
 * for it; any other return value keeps the node.
 *
 * @param \Closure $closure An anonymous function
 *
 * @return self
 */
public function reduce(\Closure $closure)
{
    $kept = array();

    foreach ($this as $position => $domNode) {
        $verdict = $closure($this->createSubCrawler($domNode), $position);
        if (false === $verdict) {
            continue;
        }

        $kept[] = $domNode;
    }

    return $this->createSubCrawler($kept);
}
||
395 | |||
/**
 * Returns the first node of the current selection (the node at position 0).
 *
 * @return self
 */
public function first()
{
    $firstPosition = 0;

    return $this->eq($firstPosition);
}
||
405 | |||
/**
 * Returns the last node of the current selection (the node at position count - 1).
 *
 * @return self
 */
public function last()
{
    $lastPosition = count($this) - 1;

    return $this->eq($lastPosition);
}
||
415 | |||
/**
 * Returns the siblings nodes of the current selection.
 *
 * The siblings are collected starting from the first child of the parent
 * of the first node of the list.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function siblings()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;
    $siblingNodes = $this->sibling($firstChildOfParent);

    return $this->createSubCrawler($siblingNodes);
}
||
431 | |||
/**
 * Returns the next siblings nodes of the current selection.
 *
 * Only the first node of the list is used as the starting point.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nextAll()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $start = $this->getNode(0);
    $following = $this->sibling($start);

    return $this->createSubCrawler($following);
}
||
447 | |||
/**
 * Returns the previous sibling nodes of the current selection.
 *
 * Only the first node of the list is used as the starting point.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function previousAll()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $start = $this->getNode(0);
    $preceding = $this->sibling($start, 'previousSibling');

    return $this->createSubCrawler($preceding);
}
||
463 | |||
/**
 * Returns the parents nodes of the current selection.
 *
 * Walks up from the first node of the list and keeps every ancestor that
 * is an element node, nearest ancestor first.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function parents()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $ancestors = array();

    for ($ancestor = $this->getNode(0)->parentNode; $ancestor; $ancestor = $ancestor->parentNode) {
        if (XML_ELEMENT_NODE !== $ancestor->nodeType) {
            continue;
        }

        $ancestors[] = $ancestor;
    }

    return $this->createSubCrawler($ancestors);
}
||
488 | |||
/**
 * Returns the children nodes of the current selection.
 *
 * Only the first node of the list is considered; a node without children
 * yields a crawler created from an empty array.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function children()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChild = $this->getNode(0)->firstChild;

    if (!$firstChild) {
        return $this->createSubCrawler(array());
    }

    return $this->createSubCrawler($this->sibling($firstChild));
}
||
506 | |||
/**
 * Returns the attribute value of the first node of the list.
 *
 * @param string $attribute The attribute name
 *
 * @return string|null The attribute value or null if the attribute does not exist
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function attr($attribute)
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element->hasAttribute($attribute)) {
        return null;
    }

    return $element->getAttribute($attribute);
}
||
526 | |||
/**
 * Returns the node name of the first node of the list.
 *
 * @return string The node name
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nodeName()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeName;
}
||
542 | |||
/**
 * Returns the node value of the first node of the list.
 *
 * @return string The node value
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function text()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeValue;
}
||
558 | |||
/**
 * Returns the first node of the list as HTML.
 *
 * The result is the concatenated HTML of the node's children, i.e. the
 * node's own tag is not part of it.
 *
 * @return string The node html
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function html()
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $fragments = array();
    foreach ($this->getNode(0)->childNodes as $childNode) {
        $fragments[] = $childNode->ownerDocument->saveHTML($childNode);
    }

    return implode('', $fragments);
}
||
579 | |||
/**
 * Extracts information from the list of nodes.
 *
 * You can extract attributes or/and the node value (_text).
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
 *
 * When exactly one attribute is requested, each entry of the result is the
 * extracted value itself; otherwise each entry is an array of values in the
 * order of $attributes (an empty array when no attribute is requested).
 *
 * @param array $attributes An array of attributes
 *
 * @return array An array of extracted values
 */
public function extract($attributes)
{
    $attributes = (array) $attributes;
    $count = count($attributes);

    $data = array();
    foreach ($this as $node) {
        $elements = array();
        foreach ($attributes as $attribute) {
            if ('_text' === $attribute) {
                $elements[] = $node->nodeValue;
            } else {
                $elements[] = $node->getAttribute($attribute);
            }
        }

        // Unwrap only the single-attribute case: with an empty attribute
        // list $elements[0] does not exist and must not be read.
        $data[] = 1 === $count ? $elements[0] : $elements;
    }

    return $data;
}
||
614 | |||
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression is evaluated in the context of the crawler, which
 * is considered as a fake parent of the elements inside it.
 * This means that a child selector "div" or "./div" will match only
 * the div elements of the current crawler, not their children.
 *
 * @param string $xpath An XPath expression
 *
 * @return self
 */
public function filterXPath($xpath)
{
    $relativeXPath = $this->relativize($xpath);

    // If we dropped all expressions in the XPath while preparing it, there would be no match
    if ('' !== $relativeXPath) {
        return $this->filterRelativeXPath($relativeXPath);
    }

    return $this->createSubCrawler(null);
}
||
638 | |||
/**
 * Filters the list of nodes with a CSS selector.
 *
 * This method only works if you have installed the CssSelector Symfony Component.
 * The selector is converted to XPath in HTML or XML mode depending on the
 * kind of content the crawler holds.
 *
 * @param string $selector A CSS selector
 *
 * @return self
 *
 * @throws \RuntimeException if the CssSelector Component is not available
 */
public function filter($selector)
{
    $converterClass = 'Symfony\\Component\\CssSelector\\CssSelectorConverter';
    if (!class_exists($converterClass)) {
        throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
    }

    $converter = new CssSelectorConverter($this->isHtml);

    // The CssSelector already prefixes the selector with descendant-or-self::
    $xpath = $converter->toXPath($selector);

    return $this->filterRelativeXPath($xpath);
}
||
661 | |||
/**
 * Selects links by name or alt value for clickable images.
 *
 * An <a> element matches when its whitespace-normalized text, or the alt
 * attribute of an <img> directly inside it, contains $value as a
 * space-delimited token.
 *
 * @param string $value The link text
 *
 * @return self
 */
public function selectLink($value)
{
    // Padding with spaces makes contains() match whole tokens only.
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');

    $xpath = sprintf(
        'descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]',
        $paddedLiteral,
        $paddedLiteral
    );

    return $this->filterRelativeXPath($xpath);
}
||
676 | |||
/**
 * Selects a button by name or alt value for images.
 *
 * Matches <input> elements of type submit/button (by value), of type image
 * (by alt), or any <input>/<button> whose id or name equals $value, plus
 * <button> elements whose text contains $value.
 *
 * @param string $value The button text
 *
 * @return self
 */
public function selectButton($value)
{
    // Lower-cases the type attribute so the type checks are case-insensitive.
    $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

    $paddedLiteral = static::xpathLiteral(' '.$value.' ');
    $exactLiteral = static::xpathLiteral($value);

    $inputByValue = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $paddedLiteral);
    $inputByAltIdOrName = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $paddedLiteral, $exactLiteral, $exactLiteral);
    $buttonElement = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

    return $this->filterRelativeXPath($inputByValue.$inputByAltIdOrName.$buttonElement);
}
||
693 | |||
/**
 * Returns a Link object for the first node in the list.
 *
 * The link is created with the crawler's base href.
 *
 * @param string $method The method for the link (get by default)
 *
 * @return Link A Link instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function link($method = 'get')
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    if ($firstNode instanceof \DOMElement) {
        return new Link($firstNode, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($firstNode)));
}
||
717 | |||
/**
 * Returns an array of Link objects for the nodes in the list.
 *
 * Every link is created with the crawler's base href and the 'get' method.
 *
 * @return Link[] An array of Link instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function links()
{
    $collected = array();

    foreach ($this as $domNode) {
        $isElement = $domNode instanceof \DOMElement;
        if (!$isElement) {
            throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($domNode)));
        }

        $collected[] = new Link($domNode, $this->baseHref, 'get');
    }

    return $collected;
}
||
738 | |||
/**
 * Returns a Form object for the first node in the list.
 *
 * The form is created with the crawler's URI and base href; when $values
 * is given, it is applied through Form::setValues().
 *
 * @param array  $values An array of values for the form fields
 * @param string $method The method for the form
 *
 * @return Form A Form instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function form(array $values = null, $method = null)
{
    if (0 === count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    $isElement = $firstNode instanceof \DOMElement;
    if (!$isElement) {
        throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($firstNode)));
    }

    $form = new Form($firstNode, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
||
769 | |||
/**
 * Overloads a default namespace prefix to be used with XPath and CSS expressions.
 *
 * The value replaces the initial prefix, 'default'.
 *
 * @param string $prefix
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
||
779 | |||
/**
 * Records a namespace under the given prefix in the map of manually
 * registered namespaces; an existing entry for the prefix is overwritten.
 *
 * @param string $prefix
 * @param string $namespace
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
||
788 | |||
/**
 * Converts string for XPath expressions.
 *
 * Escaped characters are: quotes (") and apostrophe (').
 *
 *  Examples:
 *  <code>
 *  echo Crawler::xpathLiteral('foo " bar');
 *  //prints 'foo " bar'
 *
 *  echo Crawler::xpathLiteral("foo ' bar");
 *  //prints "foo ' bar"
 *
 *  echo Crawler::xpathLiteral('a\'b"c');
 *  //prints concat('a', "'", 'b"c')
 *  </code>
 *
 * @param string $s String to be escaped
 *
 * @return string Converted string
 */
public static function xpathLiteral($s)
{
    // No apostrophe: a single-quoted literal is safe.
    if (false === strpos($s, "'")) {
        return sprintf("'%s'", $s);
    }

    // Apostrophes but no double quote: a double-quoted literal is safe.
    if (false === strpos($s, '"')) {
        return sprintf('"%s"', $s);
    }

    // Both kinds of quotes: split on apostrophes and glue the pieces back
    // together with concat(), each apostrophe becoming the literal "'".
    $segments = explode("'", $s);
    $lastSegment = array_pop($segments);

    $parts = array();
    foreach ($segments as $segment) {
        $parts[] = "'".$segment."'";
        $parts[] = "\"'\"";
    }
    $parts[] = "'".$lastSegment."'";

    return sprintf('concat(%s)', implode(', ', $parts));
}
||
835 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::attach().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function attach($object, $data = null)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::attach($object, $data);
}
||
845 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::detach().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function detach($object)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::detach($object);
}
||
855 | |||
/**
 * Reports the call through triggerDeprecation(), then returns the result
 * of \SplObjectStorage::contains().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function contains($object)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    $isContained = parent::contains($object);

    return $isContained;
}
||
865 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::addAll().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function addAll($storage)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::addAll($storage);
}
||
875 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::removeAll().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function removeAll($storage)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::removeAll($storage);
}
||
885 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::removeAllExcept().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function removeAllExcept($storage)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::removeAllExcept($storage);
}
||
895 | |||
/**
 * Reports the call through triggerDeprecation(), then returns the result
 * of \SplObjectStorage::getInfo().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function getInfo()
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    $info = parent::getInfo();

    return $info;
}
||
905 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::setInfo().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function setInfo($data)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::setInfo($data);
}
||
915 | |||
/**
 * Reports the call through triggerDeprecation(), then returns the result
 * of \SplObjectStorage::offsetExists().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function offsetExists($object)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    $exists = parent::offsetExists($object);

    return $exists;
}
||
925 | |||
/**
 * Reports the call through triggerDeprecation(), then delegates to
 * \SplObjectStorage::offsetSet().
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function offsetSet($object, $data = null)
{
    $calledMethod = __METHOD__;
    $this->triggerDeprecation($calledMethod);

    parent::offsetSet($object, $data);
}
||
935 | |||
/**
 * Deprecated pass-through to the parent offsetUnset() implementation.
 *
 * @param object $object The object forwarded unchanged to the parent
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function offsetUnset($object)
{
    // Notify first, then delegate; nothing is returned to the caller.
    $this->triggerDeprecation(__METHOD__);
    parent::offsetUnset($object);
}
||
945 | |||
/**
 * Deprecated pass-through to the parent offsetGet() implementation.
 *
 * @param object $object The object forwarded unchanged to the parent
 *
 * @return mixed Whatever the parent offsetGet() call yields
 *
 * @deprecated Using the SplObjectStorage API on the Crawler is deprecated as of 2.8 and will be removed in 3.0.
 */
public function offsetGet($object)
{
    $this->triggerDeprecation(__METHOD__);
    $data = parent::offsetGet($object);

    return $data;
}
||
955 | |||
/**
 * Evaluates an XPath expression against every node held by this crawler.
 *
 * The expression is used as-is with each node as the query context, so it must
 * already have been rewritten to be relative to those nodes. All matches are
 * collected into a fresh sub-crawler.
 *
 * @param string $xpath
 *
 * @return self
 */
private function filterRelativeXPath($xpath)
{
    $result = $this->createSubCrawler(null);
    $namespacePrefixes = $this->findNamespacePrefixes($xpath);

    foreach ($this as $contextNode) {
        // A DOMXPath is built for the document owning the context node, with the prefixes found in the expression.
        $matches = $this
            ->createDOMXPath($contextNode->ownerDocument, $namespacePrefixes)
            ->query($xpath, $contextNode);

        $result->add($matches);
    }

    return $result;
}
||
978 | |||
/**
 * Make the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * The input is scanned once and split on every "|" that sits outside quoted strings
 * and outside "[...]" predicates. Each member of the union is rewritten on its own
 * (leading axis / path prefix replaced) and the members are joined back with " | ".
 * If the scan detects an unterminated quote, or ends without reaching the end of the
 * string with all brackets closed, the input is returned unchanged.
 *
 * @param string $xpath
 *
 * @return string
 */
private function relativize($xpath)
{
    $expressions = array();

    // An expression which will never match, used to replace expressions which cannot match in the crawler.
    // NOTE(review): the original comment ended with the truncated sentence "We cannot simply drop";
    // presumably such expressions cannot simply be dropped because the union (and any preserved
    // parenthesis prefix) has to remain well-formed - TODO confirm.
    $nonMatchingExpression = 'a[name() = "b"]';

    $xpathLen = strlen($xpath);
    $openedBrackets = 0;
    // Skip leading whitespace; $startPosition always marks the start of the union member being scanned.
    $startPosition = strspn($xpath, " \t\n\r\0\x0B");

    for ($i = $startPosition; $i <= $xpathLen; ++$i) {
        // Jump to the next character of interest (quote, bracket or union operator), or to the end of the string.
        $i += strcspn($xpath, '"\'[]|', $i);

        if ($i < $xpathLen) {
            switch ($xpath[$i]) {
                case '"':
                case "'":
                    // Move $i onto the matching closing quote so the string literal's content is never inspected.
                    if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                        return $xpath; // The XPath expression is invalid
                    }
                    continue 2;
                case '[':
                    ++$openedBrackets;
                    continue 2;
                case ']':
                    --$openedBrackets;
                    continue 2;
            }
        }
        // From here on, $i is either on a "|" or at the end of the string.
        // A "|" inside a predicate does not separate top-level union members.
        if ($openedBrackets) {
            continue;
        }

        if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
            // If the union is inside some braces, we need to preserve the opening braces and apply
            // the change only inside it.
            $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1);
            $parenthesis = substr($xpath, $startPosition, $j);
            $startPosition += $j;
        } else {
            $parenthesis = '';
        }
        // The current union member, without its parenthesis prefix and without trailing whitespace.
        $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

        // BC for Symfony 2.4 and lower, where elements were added to a fake _root parent
        if (0 === strpos($expression, '/_root/')) {
            @trigger_error('XPath expressions referencing the fake root node are deprecated since version 2.8 and will be unsupported in 3.0. Please use "./" instead of "/_root/".', E_USER_DEPRECATED);

            $expression = './'.substr($expression, 7);
        } elseif (0 === strpos($expression, 'self::*/')) {
            $expression = './'.substr($expression, 8);
        }

        // add prefix before absolute element selector
        // (the order of the checks matters: e.g. "//" must be tested before the single "/" case,
        // and ".//" and "./" before the single "." case)
        if ('' === $expression) {
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, './/')) {
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (0 === strpos($expression, './')) {
            $expression = 'self::'.substr($expression, 2);
        } elseif (0 === strpos($expression, 'child::')) {
            $expression = 'self::'.substr($expression, 7);
        } elseif ('/' === $expression[0] || 0 === strpos($expression, 'self::')) {
            // the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
            // so let's drop the expression entirely
            $expression = $nonMatchingExpression;
        } elseif ('.' === $expression[0]) {
            // '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
            $expression = $nonMatchingExpression;
        } elseif (0 === strpos($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, 12);
        } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
            // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
            $expression = $nonMatchingExpression;
        } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
            // Anything else that is not already on the descendant-or-self axis is anchored on the context node itself.
            $expression = 'self::'.$expression;
        }
        $expressions[] = $parenthesis.$expression;

        // End of input reached with balanced brackets: every union member has been rewritten.
        if ($i === $xpathLen) {
            return implode(' | ', $expressions);
        }

        // Skip the whitespace following the "|"; the next member starts right after it.
        $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1);
        $startPosition = $i + 1;
    }

    return $xpath; // The XPath expression is invalid
}
||
1082 | |||
/**
 * Returns the node found at the given iteration position of this crawler.
 *
 * The position is compared loosely (==) with the iteration key.
 *
 * @param int $position
 *
 * @return \DOMElement|null The node at that position, or null when no key matches
 */
public function getNode($position)
{
    foreach ($this as $index => $candidate) {
        if ($index != $position) {
            continue;
        }

        return $candidate;
    }

    return null;
}
||
1096 | |||
/**
 * Collects element siblings by walking from the given node in one direction.
 *
 * Starting with $node itself, the chain of $siblingDir properties is followed until
 * it yields a falsy value. Every visited node that is an element and is not the
 * first node of this crawler is added to the result, in visiting order.
 *
 * @param \DOMElement $node       The node the walk starts from (it is itself a candidate)
 * @param string      $siblingDir Name of the node property to follow ("nextSibling" by default)
 *
 * @return array
 */
protected function sibling($node, $siblingDir = 'nextSibling')
{
    $nodes = array();

    // The crawler's first node does not change during the walk, so look it up once
    // instead of re-iterating the storage on every step.
    $firstNode = $this->getNode(0);

    do {
        if ($node !== $firstNode && XML_ELEMENT_NODE === $node->nodeType) {
            $nodes[] = $node;
        }
    } while ($node = $node->$siblingDir);

    return $nodes;
}
||
1115 | |||
/**
 * Builds a DOMXPath for the document and registers the namespaces of the given prefixes.
 *
 * Each prefix is resolved through discoverNamespace(); prefixes that resolve to
 * null are skipped.
 *
 * @param \DOMDocument $document
 * @param array        $prefixes
 *
 * @return \DOMXPath
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $prefix) {
        $namespaceUri = $this->discoverNamespace($xpathEngine, $prefix);

        if (null === $namespaceUri) {
            continue;
        }

        $xpathEngine->registerNamespace($prefix, $namespaceUri);
    }

    return $xpathEngine;
}
||
1137 | |||
/**
 * Resolves a namespace prefix to a namespace value.
 *
 * Manually registered namespaces take precedence. Otherwise the document is queried
 * for a namespace node with the prefix's name; the default namespace prefix is
 * looked up under the empty name.
 *
 * @param \DOMXPath $domxpath
 * @param string    $prefix
 *
 * @return string|null The namespace, or null when the query finds no namespace node
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $query = sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName);
    $declaration = $domxpath->query($query)->item(0);

    return $declaration ? $declaration->nodeValue : null;
}
||
1159 | |||
/**
 * Extracts the distinct namespace prefixes used in an XPath expression.
 *
 * A prefix is a name followed by a single colon whose next character is not
 * a double quote, a slash or another colon.
 *
 * @param string $xpath
 *
 * @return array The unique prefixes, or an empty array when none is found
 */
private function findNamespacePrefixes($xpath)
{
    $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

    if (!$found) {
        return array();
    }

    return array_unique($matches['prefix']);
}
||
1173 | |||
/**
 * Creates a crawler of the same class for a subset of nodes.
 *
 * The new instance receives this crawler's URI and base href through its
 * constructor, then inherits the HTML flag, document and registered namespaces.
 *
 * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
 *
 * @return static
 */
private function createSubCrawler($nodes)
{
    $child = new static($nodes, $this->uri, $this->baseHref);

    // State that is not passed through the constructor is copied over by hand.
    $child->namespaces = $this->namespaces;
    $child->document = $this->document;
    $child->isHtml = $this->isHtml;

    return $child;
}
||
1190 | |||
/**
 * Raises the deprecation notice for one of the inherited SplObjectStorage methods.
 *
 * When $useTrace is true or the HHVM_VERSION constant is defined, the call stack is
 * inspected first and the notice is skipped if the frame at index 2 belongs to
 * SplObjectStorage.
 *
 * @param string $methodName Name inserted into the notice (callers pass __METHOD__)
 * @param bool   $useTrace   Forces the call-stack inspection
 */
private function triggerDeprecation($methodName, $useTrace = false)
{
    $mustInspectTrace = $useTrace || defined('HHVM_VERSION');

    if ($mustInspectTrace) {
        // The frame limit is only passed on PHP 5.4 and newer.
        $frames = PHP_VERSION_ID >= 50400
            ? debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 3)
            : debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS);

        // The SplObjectStorage class performs calls to its own methods. These
        // method calls must not lead to triggered deprecation notices.
        $callerClass = isset($frames[2]['class']) ? $frames[2]['class'] : null;

        if ('SplObjectStorage' === $callerClass) {
            return;
        }
    }

    @trigger_error('The '.$methodName.' method is deprecated since version 2.8 and will be removed in 3.0.', E_USER_DEPRECATED);
}
||
1209 | } |