正在加载...

网页解析4 - 分析网页内容(中)

上篇讲了 HtmlHandler,本篇讲 HtmlParser。各位看官应该等急了。

不啰嗦,直接上代码:

html_parser.py

#coding=utf-8
import re

class HtmlParser(object):
    """Small event-driven HTML scanner.

    The parser walks an HTML string once and reports what it finds to a
    handler object through these calls (all made in this class):

    * ``handler.clear_memory()`` / ``handler.finish()`` around a ``parse``
    * ``handler.text(text)`` for non-blank text between tags
    * ``handler.comment(raw, position)`` for ``<!-- ... -->`` comments
    * ``handler.start(name, raw, position)`` for opening tags
    * ``handler.end(name, raw, position)`` for closing / self-closed tags

    The handler must also expose the boolean attributes ``handle_text`` and
    ``handle_comment``; they are read once, in ``__init__``.

    Parsing state is kept in ``tag_name`` (lower-cased name of the tag being
    processed, ``None`` while no name has been recognised) and ``is_end``
    (``True`` for a ``</...>`` tag).  ``keyWord`` and ``isEnd`` are kept as
    aliases of those two attributes for backward compatibility.
    """

    def __init__(self, handler, mask=None):
        """Create a parser.

        handler -- object receiving the parse events (see class docstring).
        mask    -- optional iterable of tag names; when given, only tags
                   whose name is in it (case-insensitively) produce
                   start/end events.  ``None`` means every tag is reported.
        """
        self.handler = handler
        self.mask = mask

        self.handle_text = handler.handle_text
        self.handle_comment = handler.handle_comment
        self.is_end = False
        self.tag_name = ''

        # Entity / control-character replacements applied to text nodes.
        # The first two keys used to be ' ' and '&' (identity mappings left
        # behind when the entity names were themselves HTML-decoded), which
        # meant &nbsp; and &amp; were never decoded.
        # NOTE(review): the '?' replacements look like characters lost to an
        # encoding problem; they are kept unchanged here - confirm the
        # intended output characters.
        self.replace_dict = {
            '&nbsp;': ' ',
            '&amp;': '&',
            '&lt;': '<',
            '&gt;': '>',
            '&brvbar;': '?',
            '&quot;': '"',
            '&middot;': '?',
            '&bull;': '?',
            '\n': ' ',
            '\r': ' ',
        }
        self.rx = re.compile('|'.join(map(re.escape, self.replace_dict)))

    # Backward-compatible aliases for the attribute names this class
    # previously used while parsing.
    @property
    def keyWord(self):
        return self.tag_name

    @keyWord.setter
    def keyWord(self, value):
        self.tag_name = value

    @property
    def isEnd(self):
        return self.is_end

    @isEnd.setter
    def isEnd(self, value):
        self.is_end = value

    def parse(self, html):
        """Reset the handler, scan ``html`` and then call ``handler.finish()``."""
        self.handler.clear_memory()
        self.parse_html(html)
        self.handler.finish()

    def parse_html(self, html):
        """Scan ``html`` and emit text/comment/start/end events.

        ``None`` is accepted and produces no events.
        """
        if html is None:
            return

        current_tag_begin, last_tag_end = 0, 0
        length = len(html)

        while last_tag_end != -1 and last_tag_end < length:
            current_tag_begin = html.find('<', last_tag_end)
            if current_tag_begin == -1:
                break

            # Everything between the previous tag and this '<' is text.
            self.text(html, last_tag_end, current_tag_begin)

            comment_end = self.comment(html, current_tag_begin)
            if comment_end > 0:  # it was a complete comment
                last_tag_end = comment_end
                continue

            # Not a comment: work out whether this is '<name' or '</name'.
            last_tag_end = self.check_is_end(html, current_tag_begin + 1)
            if last_tag_end == -1:  # nothing but whitespace after '<'
                break

            last_tag_end = self.get_tag_name_position(html, last_tag_end)
            if last_tag_end < 0:
                # Invalid character or no tag name: treat the '<' as text
                # and resume scanning right after it.
                last_tag_end = current_tag_begin + 1
                if self.handle_text:
                    self.handler.text("<")
                continue

            last_tag_end = self.find_tag_end(html, last_tag_end)
            if last_tag_end == -1:  # no closing '>'; the loop condition ends the scan
                continue

            last_tag_end += 1
            if not self.in_mask():
                continue

            raw_tag = html[current_tag_begin:last_tag_end]
            if self.is_end:
                self.handler.end(self.tag_name, raw_tag, current_tag_begin)
            else:
                self.handler.start(self.tag_name, raw_tag, current_tag_begin)
                # tag_name is always lower case.  Anchors are deliberately
                # never reported as self-closed.
                if self.tag_name != 'a' and self.is_self_closed(raw_tag):
                    self.handler.end(self.tag_name, raw_tag, current_tag_begin)

        if last_tag_end == -1:
            # The last '<' never became a tag; report it and the rest as text.
            last_tag_end = current_tag_begin

        self.text(html, last_tag_end, length)

    def is_self_closed(self, html):
        """Return True if the raw tag ``html`` ends with ``/`` before its ``>``.

        Whitespace between the ``/`` and the final character is ignored.
        """
        for i in range(len(html) - 2, 0, -1):
            c = html[i]
            if not c.isspace():
                return c == '/'
        return False

    def text(self, html, last_tag_end, current_tag_begin):
        """Emit ``html[last_tag_end:current_tag_begin]`` as a text event.

        Surrounding whitespace is stripped, entities are decoded, and nothing
        is emitted for a blank segment or when the handler does not want text.
        """
        if not self.handle_text:
            return

        # strip() also trims the trailing whitespace of a one-character
        # segment, which the previous backward scan missed.
        segment = html[last_tag_end:current_tag_begin].strip()
        if not segment:
            return  # whitespace only

        self.handler.text(self.html_decode(segment))

    def html_decode(self, html):
        """Apply every replacement in ``replace_dict`` to ``html`` in one pass."""
        return self.rx.sub(lambda match: self.replace_dict[match.group(0)], html)

    def comment(self, html, p):
        """Handle a comment starting at ``p``.

        Returns the index just past ``-->`` when ``html`` has a terminated
        comment at ``p`` (reporting it to the handler if comments are
        wanted), otherwise -1.
        """
        if not html.startswith("<!--", p):
            return -1

        close = html.find("-->", p + 2)
        if close == -1:
            return -1

        close += 3
        if self.handle_comment:
            self.handler.comment(html[p:close], p)
        return close

    def in_mask(self):
        """Return True if the current tag should be reported to the handler.

        A tag is reported when a tag name was recognised and either no mask
        is configured or the name matches a mask entry case-insensitively.
        """
        if self.tag_name is None:
            return False
        if self.mask is None:
            return True

        # tag_name is stored lower-cased, so lower-case each mask entry.
        return any(self.tag_name == name.lower() for name in self.mask)

    def find_tag_end(self, html, p):
        """Return the index of the ``>`` closing the tag, scanning from ``p``.

        A ``>`` inside a quoted attribute value (a quote that directly
        follows ``=``, whitespace aside) is skipped.  Returns -1 when no
        closing ``>`` is found.
        """
        if p == -1 or p >= len(html):
            return -1

        match = '>'  # character that ends the current state
        after_equals = False

        for i in range(p, len(html)):
            c = html[i]
            if c == match:
                if match == '>':
                    return i
                match = '>'  # closing quote: back to looking for '>'
            elif (c == "'" or c == '"') and after_equals and match == '>':
                match = c  # opening quote of an attribute value

            if c == '=':
                after_equals = True
            elif not c.isspace():
                after_equals = False

        return -1

    def get_tag_name_position(self, html, p):
        """Read the tag name starting at ``p`` into ``tag_name``.

        Returns the index of the character that terminates the name, -2 if
        an invalid name character is met first, or -1 if ``html`` ends
        before the name is terminated.  ``tag_name`` is ``None`` unless a
        name was read.
        """
        self.tag_name = None
        if p == -1 or p >= len(html):
            return -1

        for i in range(p, len(html)):
            c = html[i]

            if self.is_tag_stop_flag(c):
                self.tag_name = html[p:i].lower()
                return i  # position just past the tag name

            if not self.is_tag_name_char(c, i - p):
                return -2  # invalid character in the name

        return -1  # no terminated tag name

    def is_tag_name_char(self, ch, i):
        """Return True if ``ch`` is allowed at offset ``i`` of a tag name."""
        if ('a' <= ch <= 'z') or ('A' <= ch <= 'Z'):
            return True
        if ch in ('_', ':', '.'):
            return True
        if i > 0 and '0' <= ch <= '9':
            return True
        # '!' is accepted only as the first character (e.g. '<!DOCTYPE').
        return i == 0 and ch == '!'

    def check_is_end(self, html, p):
        """Skip whitespace from ``p`` and record whether this is a closing tag.

        Sets ``is_end`` and returns the index where the tag name should
        start (after the ``/`` for a closing tag), or -1 if only whitespace
        or nothing follows.
        """
        if p >= len(html):
            return -1

        for i in range(p, len(html)):
            c = html[i]
            if c.isspace():
                continue
            if c == '/':
                self.is_end = True
                return i + 1
            self.is_end = False
            return i
        return -1

    def is_tag_stop_flag(self, c):
        """Return True if ``c`` terminates a tag name (whitespace, ``>`` or ``/``)."""
        return c.isspace() or c == '>' or c == '/'

测试代码,看下篇。