nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | #!/usr/bin/env python |
2 | # |
||
3 | # html2text.py - converts HTML to text |
||
4 | # |
||
5 | # Wireshark - Network traffic analyzer |
||
6 | # By Gerald Combs <gerald@wireshark.org> |
||
7 | # Copyright 1998 Gerald Combs |
||
8 | # |
||
9 | # This program is free software; you can redistribute it and/or |
||
10 | # modify it under the terms of the GNU General Public License |
||
11 | # as published by the Free Software Foundation; either version 2 |
||
12 | # of the License, or (at your option) any later version. |
||
13 | # |
||
14 | # This program is distributed in the hope that it will be useful, |
||
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
17 | # GNU General Public License for more details. |
||
18 | # |
||
19 | # You should have received a copy of the GNU General Public License |
||
20 | # along with this program; if not, write to the Free Software |
||
21 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
||
22 | |||
23 | __author__ = "Peter Wu <peter@lekensteyn.nl>" |
||
24 | __copyright__ = "Copyright 2015, Peter Wu" |
||
25 | __license__ = "GPL (v2 or later)" |
||
26 | |||
27 | # TODO: |
||
28 | # multiple list indentation levels |
||
29 | # maybe allow for ascii output instead of utf-8? |
||
30 | |||
31 | import sys |
||
32 | from textwrap import TextWrapper |
||
33 | try: |
||
34 | from HTMLParser import HTMLParser |
||
35 | from htmlentitydefs import name2codepoint |
||
36 | except: # Python 3 |
||
37 | from html.parser import HTMLParser |
||
38 | from html.entities import name2codepoint |
||
39 | unichr = chr # for html entity handling |
||
40 | |||
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Feed markup with feed(); text is collected per block element, folded and
    word-wrapped, and appended to ``output_buffer``. Calling close() flushes
    the last pending block and writes the whole buffer to standard output
    encoded as UTF-8.
    """

    # End tags that terminate (commit) the text block under construction.
    _BLOCK_ELEMENTS = frozenset(
        ('p', 'li', 'ul', 'pre', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

    def __init__(self):
        try:
            # Python 3.4
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Parsers that do not accept the keyword reject it with a
            # TypeError; references are then delivered through
            # handle_charref() and handle_entityref() below.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items.

        The first line of a list item starts with the item prefix; the
        following lines are indented by the width of that prefix so the
        wrapped text lines up under the first word of the item.
        """
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            # Hanging indent: as wide as the bullet/number it replaces.
            indent += ' ' * len(self.list_item_prefix)
        kwargs = {
            'width': 66,
            'initial_indent': initial_indent,
            'subsequent_indent': indent
        }
        if sys.version_info[0:2] >= (2, 6):
            # Option only passed on 2.6 and later (same guard as before).
            kwargs['break_on_hyphens'] = False
        wrapper = TextWrapper(**kwargs)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Appends the pending text block to the output and resets it.

        ``newline`` is the separator written after the block. Nothing is
        written when the pending block is empty.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates block, list and indentation state for an opening tag."""
        # end a block of text on <br>, but also flush list items which are not
        # terminated.
        if tag == 'br' or tag == 'li':
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            self.list_item_prefix = ' * '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # Headings h1..h6 set the first indentation level and reset the
        # paragraph level.
        if tag[0] == 'h' and len(tag) == 2 and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1

    def handle_data(self, data):
        """Adds character data to the pending text block."""
        if self.skip_wrap:
            # Inside <pre>: keep the text exactly as written.
            block = data
        else:
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            # Remember trailing whitespace for the next chunk of data.
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Commits the pending block when a block element is closed."""
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            self.skip_wrap = False

    def handle_charref(self, name):
        """Handles a numeric character reference (``&#65;`` or ``&#x41;``).

        ``name`` is the reference without ``&#`` and ``;``. References that
        cannot be converted to a character are emitted literally instead of
        aborting the conversion.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = unichr(codepoint)
        except (ValueError, OverflowError):
            char = '&#%s;' % name
        self.handle_data(char)

    def handle_entityref(self, name):
        """Handles a named entity reference such as ``&amp;``.

        Names missing from ``name2codepoint`` are emitted literally as
        ``&name;`` rather than raising KeyError.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.handle_data('&%s;' % name)
        else:
            self.handle_data(unichr(codepoint))

    def close(self):
        """Finishes parsing and writes the converted text to stdout as UTF-8."""
        HTMLParser.close(self)
        self._commit_block()
        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Text-mode stdout (Python 3): write the bytes to the underlying
            # binary buffer.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
||
151 | |||
152 | |||
def main():
    """Converts the HTML given on the command line (or stdin) to text.

    The first command-line argument names the input file; when it is missing
    or ``-``, standard input is read instead. The input is decoded as UTF-8,
    fed to a TextHTMLParser, and the parser's close() writes the result.
    """
    parser = TextHTMLParser()
    args = sys.argv[1:]
    path = args[0] if args and args[0] != '-' else None
    stream = sys.stdin if path is None else open(path, 'rb')
    try:
        # A text-mode stream exposes its byte stream as ``buffer``; use it so
        # the input is always decoded explicitly below.
        stream = getattr(stream, 'buffer', stream)
        # Read the whole input as a Unicode string
        parser.feed(stream.read().decode('utf-8'))
    finally:
        # Only close what was opened here; stdin is left alone.
        if path is not None:
            stream.close()
    parser.close()


if __name__ == '__main__':
    sys.exit(main())