nexmon – Blame information for rev 1

Subversion Repositories:
Rev:
Rev Author Line No. Line
1 office 1 #!/usr/bin/env python
2 #
3 # html2text.py - converts HTML to text
4 #
5 # Wireshark - Network traffic analyzer
6 # By Gerald Combs <gerald@wireshark.org>
7 # Copyright 1998 Gerald Combs
8 #
9 # This program is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # as published by the Free Software Foundation; either version 2
12 # of the License, or (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program; if not, write to the Free Software
21 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22  
23 __author__ = "Peter Wu <peter@lekensteyn.nl>"
24 __copyright__ = "Copyright 2015, Peter Wu"
25 __license__ = "GPL (v2 or later)"
26  
27 # TODO:
28 # multiple list indentation levels
29 # maybe allow for ascii output instead of utf-8?
30  
31 import sys
32 from textwrap import TextWrapper
33 try:
34 from HTMLParser import HTMLParser
35 from htmlentitydefs import name2codepoint
36 except: # Python 3
37 from html.parser import HTMLParser
38 from html.entities import name2codepoint
39 unichr = chr # for html entity handling
40  
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Text is collected per block element, whitespace-folded and word-wrapped
    (except inside "pre"), and appended to ``output_buffer``.  Calling
    ``close()`` flushes the last block and writes the whole buffer to
    standard output as UTF-8.
    """

    # End tags that terminate the current text block.  "dl", "dd" and "dt"
    # are deliberately not handled (yet).
    _BLOCK_ELEMENTS = frozenset(
        ('p', 'li', 'ul', 'pre', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

    def __init__(self):
        """Initializes the parser with empty output and no active list."""
        try:
            # Python 3.4
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # The base class does not accept the convert_charrefs keyword.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        The first line is prefixed with the current indentation plus the
        list item prefix (if any); continuation lines get the indentation
        plus extra padding when inside a list item.  Returns the wrapped
        lines joined by newlines.
        """
        # NOTE(review): confirm the whitespace inside the indentation and
        # list prefix literals against the repository copy of this file.
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            indent += ' '
        kwargs = {
            'width': 66,
            'initial_indent': initial_indent,
            'subsequent_indent': indent
        }
        # Only pass break_on_hyphens on Python 2.6 or later.
        if sys.version_info[0:2] >= (2, 6):
            kwargs['break_on_hyphens'] = False
        wrapper = TextWrapper(**kwargs)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Appends the pending text block to the output, followed by newline.

        Nothing is written when the block is empty, but the block state
        (pending text and the "need space" flag) is always reset.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates block, list and indentation state for a start tag."""
        # end a block of text on <br>, but also flush list items which are not
        # terminated.
        if tag in ('br', 'li'):
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            self.list_item_prefix = ' * '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # Headings h1..h6 reset the indentation to their own level.
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1

    def handle_data(self, data):
        """Adds character data to the current text block."""
        if self.skip_wrap:
            block = data
        else:
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Commits the current block and leaves list or "pre" mode."""
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            self.skip_wrap = False

    def handle_charref(self, name):
        """Handles a numeric character reference such as &#65; or &#x41;.

        References that cannot be converted to a character are emitted
        literally instead of aborting the conversion.
        """
        try:
            if name[:1] in ('x', 'X'):
                # Hexadecimal form
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = unichr(codepoint)
        except (ValueError, OverflowError):
            char = '&#%s;' % name
        self.handle_data(char)

    def handle_entityref(self, name):
        """Handles a named entity reference such as &amp;.

        Entities missing from name2codepoint are emitted literally
        instead of aborting the conversion.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.handle_data('&%s;' % name)
        else:
            self.handle_data(unichr(codepoint))

    def close(self):
        """Finishes parsing and writes the UTF-8 encoded text to stdout."""
        HTMLParser.close(self)
        self._commit_block()
        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: write the bytes to the underlying binary stream.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
151  
152  
def main():
    """Converts the HTML given on the command line (or stdin) to text.

    The first command line argument names the input file; when it is
    missing or "-", standard input is read instead.  The input is decoded
    as UTF-8 and the resulting text is written by the parser on close.
    """
    htmlparser = TextHTMLParser()
    use_file = len(sys.argv) > 1 and sys.argv[1] != '-'
    stream = open(sys.argv[1], 'rb') if use_file else sys.stdin
    try:
        # Python 3 text streams expose their raw (byte) stream as "buffer";
        # read from that one so the decoding is done here.
        if hasattr(stream, 'buffer'):
            stream = stream.buffer
        # Read the whole input as a Unicode string
        document = stream.read().decode('utf-8')
        htmlparser.feed(document)
    finally:
        # Only close what was opened here; stdin is left alone.
        if use_file:
            stream.close()
    htmlparser.close()
171  
if __name__ == '__main__':
    # Run the converter and hand its return value to the interpreter as
    # the exit status.
    exit_status = main()
    sys.exit(exit_status)