正在加载...

网页解析2 - 解析标签

分析网页内容的工具有很多。为了动手解决实际问题,让代码变得更可控,我重复造了轮子。

这个部分讲如何进行网页里面标签块的解析。

例如:

<a href='#' class="anavy">

解析出里面的 href 和 class 分别是什么,作用就是这样。但是要做到下面几个方面:

  1. 可以检查标签是否可见,这个有利于以后的html分析,从而知道这个内容是否要显示出来
  2. 支持不规范的标签,比方说只有开标签、没有闭标签,或者少一个空格等等

下面不多说,直接上代码:

html_util.py ``` #coding=utf-8

class AttributeUtil(object):
    """Helpers for reading attribute values out of raw HTML tag text.

    The parser is deliberately forgiving: attribute names are matched
    case-insensitively, values may be quoted or unquoted, whitespace around
    ``=`` is allowed, and the tag does not have to be properly closed.
    """

    @classmethod
    def get_attribute(cls, html_string, attribute_name):
        """Return the value of ``attribute_name`` inside ``html_string``.

        @param html_string: raw text of a tag (or any ``name=value`` snippet).
        @param attribute_name: attribute to look up; matched case-insensitively.
        @return: ``None`` if ``html_string`` is ``None`` or the attribute is
                 not present; ``''`` if the attribute is present but has no
                 value (no ``=``, nothing after ``=``, or an unterminated
                 quote); otherwise the value with ``\\n`` / ``\\r`` removed.
        """
        if html_string is None:
            return None

        attribute_name = attribute_name.lower()

        name_pos = cls.__find_attribute_name(html_string, attribute_name)
        if name_pos == -1:
            return None

        # The first non-blank character after the name must be '='.
        eq_pos = cls.__find_char_pos(html_string, name_pos + len(attribute_name))
        if eq_pos == -1 or html_string[eq_pos] != '=':
            return ''

        value_pos = cls.__find_char_pos(html_string, eq_pos + 1)
        if value_pos == -1:
            return ''

        opener = html_string[value_pos]
        if opener == "'" or opener == '"':
            # Quoted value: runs up to the matching quote character.
            closing = html_string.find(opener, value_pos + 1)
            if closing == -1:
                return ''
            return cls.__clear_return(html_string[value_pos + 1:closing])

        # Unquoted value: runs up to the next blank or the end of the tag.
        # Scanning starts at value_pos itself so that an empty value directly
        # followed by '>' or '/>' yields '' instead of the tag terminator.
        value_end = cls.__find_blank_pos(html_string, value_pos)
        if value_end == -1:
            return html_string[value_pos:].strip()

        return cls.__clear_return(html_string[value_pos:value_end].strip())

    @classmethod
    def __clear_return(cls, text):
        """Return ``text`` with all line-feed and carriage-return characters removed."""
        if text is None:
            return None

        return text.replace('\n', '').replace('\r', '')

    @classmethod
    def __find_blank_pos(cls, html_string, p):
        """Return the index at or after ``p`` where an unquoted value ends.

        The value ends at the first whitespace character, at a final ``>``,
        or at the ``/`` of a final ``/>``. Returns -1 if none is found.
        """
        last = len(html_string) - 1
        for i in range(p, len(html_string)):
            c = html_string[i]
            if c.isspace():
                return i
            # Only the closing '>' of the tag terminates the value.
            if c == '>' and i == last:
                return i
            # A '/' terminates the value only as part of a trailing '/>';
            # a '/' inside a path such as "img/a" must not cut the value.
            if c == '/' and i == last - 1 and html_string[last] == '>':
                return i

        return -1

    @classmethod
    def __find_char_pos(cls, html_string, p):
        """Return the index of the first non-whitespace character at or after ``p``, or -1."""
        if p < 0:
            return -1

        for i in range(p, len(html_string)):
            if not html_string[i].isspace():
                return i
        return -1

    @classmethod
    def __find_attribute_name(cls, html_string, attribute_name):
        """Return the index where ``attribute_name`` occurs as a whole attribute name.

        ``attribute_name`` is expected to be lower-case already. A match must
        start the string or follow whitespace or a quote, and must end the
        string or be followed by whitespace or ``=``. Returns -1 if there is
        no such occurrence or if ``attribute_name`` is empty.
        """
        # An empty name would match at every position without ever advancing.
        if not attribute_name:
            return -1

        lowered = html_string.lower()
        name_length = len(attribute_name)
        total = len(html_string)

        pos = lowered.find(attribute_name)
        while pos != -1:
            end = pos + name_length

            starts_cleanly = pos == 0
            if not starts_cleanly:
                before = html_string[pos - 1]
                starts_cleanly = before.isspace() or before == '"' or before == "'"

            # Reaching the end of the string counts as a clean end.
            ends_cleanly = end >= total
            if not ends_cleanly:
                after = html_string[end]
                ends_cleanly = after.isspace() or after == '='

            if starts_cleanly and ends_cleanly:
                return pos

            pos = lowered.find(attribute_name, end)

        return -1

    @classmethod
    def get_value_in_url(cls, url, key):
        """Return the value of query parameter ``key`` in ``url``.

        @param url: URL text; everything after the first ``?`` is treated as
                    the query string and split on ``&``.
        @param key: parameter name, matched case-insensitively.
        @return: the raw (undecoded) value of the first matching parameter,
                 ``''`` if the parameter has no ``=value`` part, or ``None``
                 if ``url`` is ``None``, has no ``?``, or lacks the key.
        """
        if url is None:
            return None

        query_start = url.find('?')
        if query_start == -1:
            return None

        lower_key = key.lower()
        for param in url[query_start + 1:].split('&'):
            # partition() copes with "flag" (no '=') and "a=b=c" (extra '='),
            # both of which make a two-name unpacking of split('=') fail.
            name, _, value = param.partition('=')
            if name.lower() == lower_key:
                return value

        return None

    @classmethod
    def style_visibility(cls, value):
        """Tell whether a tag is visible according to its inline ``style``.

        Only the ``visibility`` declaration is inspected, and the first one
        found decides the result.

        @param value: raw tag text.
        @return: False if the style declares ``visibility`` as ``hidden`` or
                 ``false``; True otherwise, including when there is no
                 ``style`` attribute or no ``visibility`` declaration.
        """
        style = cls.get_attribute(value, 'style')
        if style is None:
            return True

        for declaration in style.lower().split(';'):
            prop, _, setting = declaration.partition(':')
            # Compare the whole property name so that e.g.
            # "backface-visibility" is not mistaken for "visibility".
            if prop.strip() != 'visibility':
                continue

            # Ignore a trailing "!important" flag.
            setting = setting.partition('!')[0].strip()
            return setting not in ('hidden', 'false')

        return True

测试代码如下: testhtmlutil.py

````python
#coding=utf-8

import unittest
from html_util import AttributeUtil


class AttributeAnalyzerTest(unittest.TestCase):
    """Unit tests for AttributeUtil's attribute and style parsing."""

    def test_style_visibility(self):
        """An unrecognised visibility value (here the misspelt 'hiden') counts as visible."""
        tag_string = "<DIV ID=\"investInfo\" STYLE=\"position:absolute; width:106px; z-index:7; visibility: hiden\" onMouseOver=\"MM_showHideLayers('newsCenter','','hide','dataCenter','','hide','viewGovernment','','hide','chengxinCenter','','hide','workHall','','hide','superviseAppeal','','hide','participateGovern','','hide','investInfo','','show','serviceGuide','','hide')\"  onMouseOut=\"MM_showHideLayers('newsCenter','','hide','dataCenter','','hide','viewGovernment','','hide','chengxinCenter','','hide','workHall','','hide','superviseAppeal','','hide','participateGovern','','hide','investInfo','','hide','serviceGuide','','hide')\">"
        visibility = AttributeUtil.style_visibility(tag_string)
        self.assertTrue(visibility)

    def test_get_simple_attribute(self):
        """A dotted name at the very start of the string is found and its quoted value returned."""
        tag_string = "window.location.href='bbsShowTopic.aspx?BoardID=148&Page=182'"
        value = AttributeUtil.get_attribute(tag_string, "window.location.href")
        self.assertEqual(value, 'bbsShowTopic.aspx?BoardID=148&Page=182')

    def test_get_none_attribute(self):
        """
        A value with a stray closing quote but no opening quote is read as an
        unquoted value, so the quote is kept: the result is '#"'.
        (Original author's note: an earlier version returned None here.)
        """
        value = AttributeUtil.get_attribute("<a href=#\" class=\"anavy\">", "href")
        self.assertEqual(value, '#"')

    def test_get_attribute_from_no_standrand_format_html(self):
        """An attribute glued to the previous value's closing quote is still found."""
        value = AttributeUtil.get_attribute("<sdgsadg witht=\"123\"src=ddsg", "src")
        self.assertEqual(value, 'ddsg')

    def test_get_empty_attribute_from_no_standrand_format_html(self):
        """An unterminated quoted value yields an empty string."""
        value = AttributeUtil.get_attribute("<sdgsadg src='", "src")
        self.assertEqual('', value)

    def test_get_upper_attribute(self):
        """The case of the attribute value is preserved."""
        value = AttributeUtil.get_attribute("<sdgsadg src='SD'", "src")
        self.assertEqual('SD', value)

    def test_get_no_standrand_attribute(self):
        """Whitespace around '=' is tolerated."""
        html_string = "<A class=blue href = \"http://shanghai.sohu.com/\" target=_blank>"
        value = AttributeUtil.get_attribute(html_string, "href")
        self.assertEqual(value, 'http://shanghai.sohu.com/')

    def test_get_attribute_with_upper_attribute_name(self):
        """The requested attribute name is matched case-insensitively."""
        value = AttributeUtil.get_attribute("<sdgsadg src='SD' <aa>", "SRC")
        self.assertEqual(value, 'SD')

    def test_get_none_attribute_2(self):
        """A name that only occurs inside other words is not treated as an attribute."""
        value = AttributeUtil.get_attribute("<script>window.location='http://www.shang360.com'</script>", "a")
        self.assertTrue(value is None)

    def test_tuge_page_fetch(self):
        """Same input and expectation as test_get_none_attribute_2."""
        value = AttributeUtil.get_attribute("<script>window.location='http://www.shang360.com'</script>", "a")
        self.assertTrue(value is None)

    def test_img_tag_and_no_standrand_attribute(self):
        """An attribute on a new line after an unquoted value containing spaces is found."""
        tag_string = '''<IMG alt=在纽约联合国总部安理会会议厅,中国常驻联合国代表李保东 (左二) 与其他一些国家与会代表交谈。 
src="http://photocdn.sohu.com/20100610/Img272688535.jpg" align=middle>'''
        value = AttributeUtil.get_attribute(tag_string, "src")
        self.assertEqual('http://photocdn.sohu.com/20100610/Img272688535.jpg', value)

# Run the test cases in this file when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

````