понедельник, 27 декабря 2010 г.

Парсер простого HTML

Закрывает незакрытые теги, убирает ненужные теги и атрибуты. Выводит отчёт.
Код - пример #1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
<?php
 
error_reporting(-1);
 
/**
 * Minimal whitelist-based HTML cleaner.
 *
 * Walks over every tag found in the input, drops tags and attributes that are
 * not whitelisted, normalises void tags to the self-closing form, and appends
 * closing tags for anything left open. Every correction is reported in
 * {@see MyHtmlTidy::$errors}.
 *
 * Note: attribute VALUES are passed through untouched (e.g. a "javascript:"
 * URL inside href is not filtered), so this class alone is not an XSS filter.
 */
class MyHtmlTidy
{
    const
        // One complete tag; quoted attribute values may contain '>' safely.
        TAG  = '<(?:"[^"]*"|\'[^\']*\'|[^\'">])*>',
        // One name=value pair: double-quoted, single-quoted or unquoted value.
        ATTR = '\w++\s*=\s*"[^"]++"|\w++\s*=\s*\'[^\']++\'|\w++\s*=\s*[^\s]++';

    private
        // Tags that are allowed to survive.
        $_goodTags  = array('b', 'i', 'u', 's', 'p', 'a', 'img', 'br', 'hr'),
        // Tags that never have a closing counterpart; always emitted as <tag/>.
        $_selfClose = array('img', 'br', 'hr'),
        // Allowed attributes per tag; tags missing here may not have any.
        $_goodAttrs = array(
                      'a'   => array('href', 'title'),
                      'img' => array('src', 'alt')),
        // Stack of currently open tag names (innermost last).
        $_nest      = array();

    public
        // Human-readable report of everything fixed during the last preparse().
        $errors = array();

    /**
     * Cleans an HTML fragment.
     *
     * Resets the error report, filters every tag through the whitelist and
     * closes tags that are still open at the end of the input.
     *
     * @param string $html Raw HTML fragment (expected to be valid UTF-8).
     * @return string Cleaned fragment; an empty string when the input cannot
     *                be processed by PCRE (for example, invalid UTF-8).
     */
    public function preparse($html)
    {
        $this->_nest = array();
        $this->errors = array();

        $text = preg_replace_callback('/('.self::TAG.')/Uus', array($this, '_replace'), $html);

        // preg_replace_callback() yields null on a PCRE error (the "u" flag
        // rejects malformed UTF-8). Fail closed instead of leaking null.
        if ($text === null) {
            $this->_nest = array();
            $this->errors[] = 'Unable to parse input (invalid UTF-8 or PCRE failure)';
            return '';
        }

        if (!empty($this->_nest)) {
            $this->errors[] = 'Unclosed tags ' . implode(', ', $this->_nest);
            // Close innermost tags first.
            $text .= '</' . implode('></', array_reverse($this->_nest)) . '>';
        }

        return $text;
    }

    /**
     * preg_replace_callback() handler: returns the sanitized form of one tag.
     *
     * @param array $matches Regex match; index 1 holds the full tag text.
     * @return string Replacement markup ('' when the tag is dropped).
     */
    private function _replace($matches)
    {
        $tag = $matches[1];

        // Things like "<>", "<!-- ... -->" or "<!DOCTYPE ...>" have no tag
        // name; they fall through to the whitelist check with an empty name.
        $tagName = '';
        $nameLength = 0;
        if (preg_match('/^<\/?(\w++)/', $tag, $m)) {
            $tagName = strtolower($m[1]);
            $nameLength = strlen($m[0]);
        }

        if (!in_array($tagName, $this->_goodTags, true)) {
            $this->errors[] = 'Tag ' . $tagName . ' is deprecated';
            return '';
        }

        // Closing tag
        if ($tag[1] === '/') {
            if (empty($this->_nest) || end($this->_nest) !== $tagName) {
                $this->errors[] = 'Odd close tag ' . $tagName;
                // Emit an empty balanced pair so the output stays well-formed.
                return '<' . $tagName . '></' . $tagName . '>';
            }
            array_pop($this->_nest);
            return '</' . $tagName . '>';
        }

        // Open tag or self-closing tag. The character right before '>' tells
        // whether the author wrote "<tag ... />".
        $hasSlash = substr($tag, -2, 1) === '/';
        $attrs = trim(substr($tag, $nameLength, ($hasSlash ? -2 : -1)));
        $isSelfClosed = $hasSlash || in_array($tagName, $this->_selfClose, true);

        if (!$isSelfClosed) {
            $this->_nest[] = $tagName;
        }

        if (!isset($this->_goodAttrs[$tagName])) {
            // No attributes at all
            if (strlen($attrs)) {
                $this->errors[] = 'Tag ' . $tagName . ' cannot have attributes';
            }
            $attrs = '';
        } else {
            // Check every attribute
            preg_match_all('/'.self::ATTR.'/Uus', $attrs, $found);
            $kept = array();
            foreach ($found[0] as $attr) {
                $p = strpos($attr, '=');
                $attrName = strtolower(trim(substr($attr, 0, $p)));
                if (!in_array($attrName, $this->_goodAttrs[$tagName], true)) {
                    $this->errors[] = 'Wrong ' . $tagName . ' attribute ' . $attrName;
                    continue;
                }
                // Normalise to name=value without spaces around '='.
                $kept[] = $attrName . '=' . trim(substr($attr, $p + 1));
            }
            $attrs = count($kept) ? (' ' . implode(' ', $kept)) : '';
        }

        return '<' . $tagName . $attrs . ($isSelfClosed ? '/>' : '>');
    }
}
 
// Demo: run a deliberately broken fragment through the cleaner and print,
// as plain text, the raw input, the list of fixes, and the cleaned markup.
$tidy = new MyHtmlTidy();

// Sample input: forbidden attributes, an unquoted value, unclosed <p> and <b>.
$source = <<<'HTML'
<p class='blabla'>dslkldsldslsd<br>
kjksdjsdk<a href="http://thesite.name/path" target="_new" title="ololo" onclick="javascript:doit('xxx')">djdkjdk</a>
<img src=0.gif alt='pysh-pysh'>
ds;lsd;; <b>skjskjsk kjdkjdkd
HTML;

$separator = "\n===========================\n";

// Plain text so the browser shows the markup instead of rendering it.
header('Content-type: text/plain');

echo $source, $separator;

$cleaned = $tidy->preparse($source);

// The report section is printed only when something was actually fixed.
if ($tidy->errors) {
    echo implode("\n", $tidy->errors), $separator;
}

echo $cleaned;
Выводит:
Код - пример #1
<p class='blabla'>dslkldsldslsd<br>
kjksdjsdk<a href="http://thesite.name/path" target="_new" title="ololo" onclick="javascript:doit('xxx')">djdkjdk</a>
<img src=0.gif alt='pysh-pysh'>
ds;lsd;; <b>skjskjsk kjdkjdkd
===========================
Tag p cannot have attributes
Wrong a attribute target
Wrong a attribute onclick
Unclosed tags p, b
===========================
<p>dslkldsldslsd<br/>
kjksdjsdk<a href="http://thesite.name/path" title="ololo">djdkjdk</a>
<img src=0.gif alt='pysh-pysh'/>
ds;lsd;; <b>skjskjsk kjdkjdkd</b></p>
Скопипастил с пыхи

Комментариев нет:

Отправить комментарий