WebSVN – nexmon – Blame – Rev 1 – /utilities/wireshark/tools/html2text.py

1

office

1

#!/usr/bin/env python

2

#

3

# html2text.py - converts HTML to text

4

#

5

# Wireshark - Network traffic analyzer

6

# By Gerald Combs <gerald@wireshark.org>

7

8

#

9

# This program is free software; you can redistribute it and/or

10

# modify it under the terms of the GNU General Public License

11

# as published by the Free Software Foundation; either version 2

12

# of the License, or (at your option) any later version.

13

#

14

# This program is distributed in the hope that it will be useful,

15

# but WITHOUT ANY WARRANTY; without even the implied warranty of

16

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

17

# GNU General Public License for more details.

18

#

19

# You should have received a copy of the GNU General Public License

20

# along with this program; if not, write to the Free Software

21

# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

22

23

__author__ = "Peter Wu <peter@lekensteyn.nl>"

24

25

__license__ = "GPL (v2 or later)"

26

27

# TODO:

28

# multiple list indentation levels

29

# maybe allow for ascii output instead of utf-8?

30

31

import sys

32

from textwrap import TextWrapper

33

try:
    # Python 2 module names.
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3
    # Only a missing module selects the fallback; any other error raised
    # while importing is left to propagate.
    from html.parser import HTMLParser
    from html.entities import name2codepoint
    unichr = chr  # for html entity handling

40

41

class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Markup passed to feed() is collected one text block at a time. A block
    ends on <br>, on <li>, or on the end tag of a block element; it is then
    word-wrapped (except inside <pre>) and appended to output_buffer.
    close() flushes the last block and writes the whole buffer to standard
    output as UTF-8.
    """

    # Maximum width of a wrapped output line.
    WRAP_WIDTH = 66

    # End tags which terminate the current text block.
    # (dl, dd and dt are not treated as block elements.)
    BLOCK_ELEMENTS = frozenset(
        ('p', 'li', 'ul', 'pre', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

    def __init__(self):
        """Initializes the parser with empty output and default state."""
        try:
            # Python 3.4
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # This parser does not accept the convert_charrefs keyword.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        The first line starts with the current indentation followed by the
        list item prefix (if any). Continuation lines are padded to the same
        width so that they line up with the text of the first line.
        Returns the wrapped lines joined by newlines.
        """
        base_indent = ' ' * sum(self.indent_levels)
        initial_indent = subsequent_indent = base_indent
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            # Hanging indent as wide as the prefix, whatever its length.
            subsequent_indent += ' ' * len(self.list_item_prefix)
        kwargs = {
            'width': self.WRAP_WIDTH,
            'initial_indent': initial_indent,
            'subsequent_indent': subsequent_indent,
        }
        # TextWrapper only accepts break_on_hyphens since Python 2.6.
        if sys.version_info[0:2] >= (2, 6):
            kwargs['break_on_hyphens'] = False
        return '\n'.join(TextWrapper(**kwargs).wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Appends the pending text block to the output buffer.

        The block is wrapped unless skip_wrap is set, and is followed by
        `newline`. An empty block adds nothing. The pending-space flag is
        always cleared.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates block, list and indentation state for a start tag.

        `attrs` is not used.
        """
        # end a block of text on <br>, but also flush list items which are not
        # terminated.
        if tag == 'br' or tag == 'li':
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            self.list_item_prefix = ' * '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # A heading <hN> indents following text by N - 1 columns and clears
        # the paragraph indentation.
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1

    def handle_data(self, data):
        """Adds character data to the current text block.

        Inside <pre> the data is kept verbatim. Otherwise runs of whitespace
        are folded to a single space, and a single space is kept between
        pieces of text which were separated by whitespace.
        """
        if self.skip_wrap:
            block = data
        else:
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            has_text = bool(data.strip())
            if has_text and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and has_text and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Commits the current block and resets list / pre state."""
        if tag in self.BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            self.skip_wrap = False

    def handle_charref(self, name):
        """Handles a numeric character reference such as &#65; or &#x41;.

        `name` is the reference without the leading "&#" and trailing ";".
        A malformed or out-of-range reference is emitted as literal text.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#%s;' % name
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handles a named character reference such as &amp;.

        An unknown entity name is emitted as literal text.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = '&%s;' % name
        else:
            text = unichr(codepoint)
        self.handle_data(text)

    def close(self):
        """Finishes parsing and writes the converted text to stdout.

        The output is encoded as UTF-8. The bytes go to sys.stdout.buffer
        when that attribute exists, and to sys.stdout otherwise.
        """
        HTMLParser.close(self)
        self._commit_block()
        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)

def main():
    """Converts the HTML named on the command line (or stdin) to text.

    The first command-line argument selects the input file. When it is
    missing or equal to '-', standard input is read instead. The input is
    decoded as UTF-8, fed to a TextHTMLParser, and the parser is closed.
    """
    converter = TextHTMLParser()
    args = sys.argv[1:]
    path = args[0] if args and args[0] != '-' else None
    stream = sys.stdin if path is None else open(path, 'rb')
    try:
        # Access raw (byte) buffer in Python 3 instead of decoded one
        stream = getattr(stream, 'buffer', stream)
        # Read the whole input as a Unicode string
        converter.feed(stream.read().decode('utf-8'))
    finally:
        # Only close a file which was opened here, never stdin.
        if path is not None:
            stream.close()
    converter.close()

# Run the converter only when executed as a script; main() returns None,
# so the process exits with status 0 on success.
if __name__ == '__main__':
    sys.exit(main())

nexmon – Blame information for rev 1