diff --git a/examples/writing/htmlToRtf15.py b/examples/writing/htmlToRtf15.py new file mode 100644 index 0000000..df7e8d5 --- /dev/null +++ b/examples/writing/htmlToRtf15.py @@ -0,0 +1,12 @@ +from pyth.plugins.xhtml.reader import XHTMLReader +from pyth.plugins.rtf15.writer import Rtf15Writer +import sys + +if len(sys.argv) > 1: + filename = sys.argv[1] +else: + filename = "tests/html/sample-with-image.html" +source = open(filename, "rb") +doc = XHTMLReader.read(source) + +print Rtf15Writer.write(doc).getvalue() diff --git a/examples/writing/rtf15ToXhtml.py b/examples/writing/rtf15ToXhtml.py new file mode 100644 index 0000000..d350e9d --- /dev/null +++ b/examples/writing/rtf15ToXhtml.py @@ -0,0 +1,12 @@ +from pyth.plugins.xhtml.writer import XHTMLWriter +from pyth.plugins.rtf15.reader import Rtf15Reader +import sys + +if len(sys.argv) > 1: + filename = sys.argv[1] +else: + filename = "tests/rtfs/sample-with-image.rtf" +source = open(filename, "rb") +doc = Rtf15Reader.read(source) + +print XHTMLWriter.write(doc).getvalue() diff --git a/pyth/document.py b/pyth/document.py index 864d519..d0cb6d8 100644 --- a/pyth/document.py +++ b/pyth/document.py @@ -7,7 +7,7 @@ class _PythBase(object): def __init__(self, properties={}, content=[]): self.properties = {} self.content = [] - + for (k,v) in properties.iteritems(): self[k] = v @@ -33,7 +33,7 @@ def append(self, item): If the item is of the wrong type, and if this element has a sub-type, then try to create such a sub-type and insert the item into that, instead. - + This happens recursively, so (in python-markup): L [ u'Foo' ] actually creates: @@ -51,7 +51,7 @@ def append(self, item): okay = False else: okay = False - + if not okay: raise TypeError("Wrong content type for %s: %s (%s)" % ( self.__class__.__name__, repr(type(item)), repr(item))) @@ -94,10 +94,10 @@ class Image(Paragraph): """ An image is stored in bytes. All properties of images from the rtf definition are allowed. """ - - validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap', - 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal', - 'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr', + + validProperties = ('emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', 'dibitmap', + 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', 'picw', 'pich', 'picwgoal', + 'pichgoal', 'picscalex', 'picscaley', 'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap') contentType = bytes @@ -122,7 +122,7 @@ class List(Paragraph): validProperties = () contentType = ListEntry - + class Document(_PythBase): @@ -130,6 +130,6 @@ class Document(_PythBase): Top-level item. One document is exactly one file. Documents consist of a list of paragraphs. """ - + validProperties = ('title', 'subject', 'author') contentType = Paragraph diff --git a/pyth/plugins/plaintext/writer.py b/pyth/plugins/plaintext/writer.py index 9dd8bfd..a00dc25 100644 --- a/pyth/plugins/plaintext/writer.py +++ b/pyth/plugins/plaintext/writer.py @@ -46,7 +46,8 @@ def go(self): def paragraph(self, paragraph, prefix=""): content = [] for text in paragraph.content: - content.append(u"".join(text.content)) + if text.__class__ != document.Image: + content.append(u"".join(text.content)) content = u"".join(content).encode("utf-8") for line in content.split("\n"): diff --git a/pyth/plugins/rtf15/reader.py b/pyth/plugins/rtf15/reader.py index 7a74162..5e8aca0 100644 --- a/pyth/plugins/rtf15/reader.py +++ b/pyth/plugins/rtf15/reader.py @@ -58,9 +58,9 @@ # All the ones named by number in my 2.6 encodings dir, and those listed above _CODEPAGES_BY_NUMBER = dict( - (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, + (x, "cp%s" % x) for x in (37, 424, 437, 500, 737, 775, 850, 852, 855, 856, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, - 875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, + 875, 932, 936, 949, 950, 1006, 1026, 1140, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361)) # Miscellaneous, incomplete @@ -157,7 +157,7 @@ def getControl(self): if not next: break - if first and next in '\\{}': + if first and next in '\\{}~_-': chars.extend("control_symbol") digits.append(next) break @@ -224,7 +224,7 @@ def __init__(self, doc, clean_paragraphs=True): def flushRun(self): if self.block is None: self.block = document.Paragraph() - + if self.isImage: self.block.content.append( document.Image(self.propStack[-1].copy(), @@ -317,11 +317,16 @@ def handle_Para(self, para): self.listStack.append(l) elif self.listLevel < prevListLevel: - l = self.listStack.pop() - self.listStack[-1].append(l) + times = prevListLevel + 1 + if self.listLevel is not None: + times = times - (self.listLevel + 1) + depth = len(self.listStack) - 1 + for _ in xrange(min(times, depth)): + l = self.listStack.pop() + self.listStack[-1].append(l) self.block = None - + def handle_Pict(self, pict): self.flushRun() self.isImage = True @@ -354,7 +359,7 @@ def handle_ImageMarker(self, marker): del self.propStack[-1][marker.name] else: self.propStack[-1][marker.name] = True - + class Group(object): @@ -386,7 +391,7 @@ def __init__(self, reader, parent=None, charsetTable=None): def flushChars(self): - chars = "".join(self.charBuffer).decode(self.charset, self.reader.errors) + chars = u"".join(c.decode(self.charset, self.reader.errors) for c in self.charBuffer) self.content.append(chars) self.charBuffer = [] @@ -398,11 +403,11 @@ def handle(self, control, digits): if control == '*': self.destination = True return - - if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', - 'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', - 'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley', - 'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', + + if self.image and control in ['emfblip', 'pngblip', 'jpegblip', 'macpict', 'pmmetafile', 'wmetafile', + 'dibitmap', 'wbitmap', 'wbmbitspixel', 'wbmplanes', 'wbmwidthbytes', + 'picw', 'pich', 'picwgoal', 'pichgoal', 'picscalex', 'picscaley', + 'picscaled', 'piccropt', 'piccropb', 'piccropr', 'piccropl', 'picbmp', 'picbpp', 'bin', 'blipupi', 'blipuid', 'bliptag', 'wbitmap']: self.content.append(ImageMarker(control, digits)) return @@ -602,12 +607,12 @@ def handle_strike(self, onOff=None): def handle_ilvl(self, level): - if self.currentParaTag is not None: - self.currentParaTag.listLevel = level - else: - # Well, now we're in trouble. But I'm pretty sure this - # isn't supposed to happen anyway. - pass + if self.currentParaTag is None: + # this can happen where documents open straight with lists rather than a containing Para.. + p = Para() + self.content.append(p) + self.currentParaTag = p + self.currentParaTag.listLevel = int(level) def handle_up(self, amount): @@ -650,7 +655,7 @@ def handle_tab(self): def handle_trowd(self): self.content.append(u'\n') - + #Handle the image tag def handle_pict(self): p = Pict() @@ -658,7 +663,7 @@ def handle_pict(self): self.image = p #Remove the destination control group of the parent, so that the image is preserved self.parent.destination = False - + def handle_field(self): def finalize(): if len(self.content) != 2: @@ -745,7 +750,7 @@ def __init__(self): def __repr__(self): return "!Image!" - + class Para(ReadableMarker): listLevel = None diff --git a/pyth/plugins/rtf15/writer.py b/pyth/plugins/rtf15/writer.py index bf00511..b33b153 100644 --- a/pyth/plugins/rtf15/writer.py +++ b/pyth/plugins/rtf15/writer.py @@ -4,6 +4,7 @@ http://www.biblioscape.com/rtf15_spec.htm """ +import binascii from pyth import document from pyth.format import PythWriter @@ -55,12 +56,16 @@ def __init__(self, doc, target, family): document.List: self._list, document.Paragraph: self._paragraph } + self.paragraphContentDispatch = { + document.Text: self._text, + document.Image: self._image, + } def go(self): self.listLevel = -1 self.addSpacing = None - + self.target.write('{') self._writeHeader() self._writeDocument() @@ -105,7 +110,7 @@ def _getFontTable(self): # We need Symbol for list bullets output.append(r'{\f%d\fnil\fprq0\fcharset128 Symbol;}' % (i+1)) self.symbolFontNumber = i+1 - + output.append('}') return "".join(output) @@ -138,7 +143,7 @@ def _getListTable(self): output.append('}}') return "".join(output) - + def _getListOverrides(self): # I have no idea what the point is of this, @@ -153,7 +158,7 @@ def _getRevTable(self): # ----------------------------------------------- # Document section - + def _writeDocument(self): @@ -193,14 +198,15 @@ def _paragraph(self, paragraph, spacing=PARAGRAPH_SPACING): if self.addSpacing is not None: self.target.write(r'\sb%d' % self.addSpacing) self.addSpacing = None - + # Space after the paragraph, # expressed in units of god-knows-what self.target.write(r'\sa%d{' % spacing) - - for text in paragraph.content: - self._text(text) - + + for item in paragraph.content: + handler = self.paragraphContentDispatch[item.__class__] + handler(item) + self.target.write(r'}\par\pard' '\n') @@ -241,23 +247,26 @@ def _text(self, text): for prop in text.properties: if prop in _styleFlags: props.append(_styleFlags[prop]) - + if props: self.target.write("".join(props) + " ") - - for run in text.content: + + for run in text.content: for unichar in run: if unichar == '\n': self.target.write(r'\line ') continue - + # Escape control characters + if unichar in '\\{}': + self.target.write(r'\%s' % unichar) + continue point = ord(unichar) if point < 128: self.target.write(str(unichar)) else: self.target.write(r'\u%d?' % point) - + if props: self.target.write("".join("%s0" % p for p in props) + " ") @@ -266,3 +275,21 @@ def _text(self, text): if 'url' in text.properties: self.target.write('}}') + + def _image(self, image): + self.target.write(r'{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE "cid:image001.png@01CDC656.1C7FFF50" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict') + properties = "".join('\\' + prop + (val if val != True else '') for prop, val in image.properties.iteritems()) + self.target.write(properties) + self.target.write(' \n') + image_data = binascii.hexlify(image.content[0]) + for i in chunk(image_data): + self.target.write(i) + self.target.write('\n') + self.target.write(r'}}}}') + +def chunk(data, size=200): + length = len(data) + end = 0 + while length > end: + end = end + size + yield data[end-size:end] diff --git a/pyth/plugins/xhtml/css.py b/pyth/plugins/xhtml/css.py index e2fe5be..e1e44eb 100644 --- a/pyth/plugins/xhtml/css.py +++ b/pyth/plugins/xhtml/css.py @@ -135,3 +135,6 @@ def is_super(self, node): properties = self.get_properties(node) return properties.get('vertical-align') == 'super' + def is_underline(self, node): + properties = self.get_properties(node) + return properties.get('text-decoration') == 'underline' diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py index 775bf58..312845e 100644 --- a/pyth/plugins/xhtml/reader.py +++ b/pyth/plugins/xhtml/reader.py @@ -2,6 +2,8 @@ Read documents from xhtml """ +import base64 + import BeautifulSoup from pyth import document @@ -9,6 +11,9 @@ from pyth.plugins.xhtml.css import CSS +BASE64_PNG_IMG_SRC = 'data:image/png;base64,' + + class XHTMLReader(PythReader): @classmethod @@ -80,6 +85,10 @@ def is_italic(self, node): return (node.findParent(['em', 'i']) is not None or self.css.is_italic(node)) + def is_underline(self, node): + return (node.findParent(['u']) is not None or + self.css.is_underline(node)) + def is_sub(self, node): """ Return true if the BeautifulSoup node needs to be rendered as @@ -110,6 +119,21 @@ def url(self, node): else: return self.link_callback(a_node.get('href')) + def dimensions(self, node): + """ + return (int(width), int(height)) in pixels if a node has these declared in px in a style attribute, else None for either + or both attributes + """ + try: + style = node['style'] + except KeyError: + return None, None + else: + declarations = self.css.parse_declarations(style) + width = _parse_px(declarations.get('width', None)) + height = _parse_px(declarations.get('height', None)) + return width, height + def process_text(self, node): """ Return a pyth Text object from a BeautifulSoup node or None if @@ -125,6 +149,8 @@ def process_text(self, node): properties['bold'] = True if self.is_italic(node): properties['italic'] = True + if self.is_underline(node): + properties['underline'] = True if self.url(node): properties['url'] = self.url(node) if self.is_sub(node): @@ -151,7 +177,7 @@ def process_into(self, node, obj): new_obj = document.Paragraph() obj.append(new_obj) obj = new_obj - elif node.name == 'ul': + elif node.name in ('ul', 'ol'): # add a new list new_obj = document.List() obj.append(new_obj) @@ -161,5 +187,34 @@ def process_into(self, node, obj): new_obj = document.ListEntry() obj.append(new_obj) obj = new_obj + elif node.name == 'img': + if node.get('src', '').startswith(BASE64_PNG_IMG_SRC): + base64_data = node['src'][len(BASE64_PNG_IMG_SRC):] + new_obj = document.Image() + new_obj.append(base64.b64decode(base64_data)) + new_obj['pngblip'] = True + width, height = self.dimensions(node) + if height: + height = unicode(_px_to_twips(height)) + new_obj['pich'] = height + new_obj['pichgoal'] = height + if width: + width = unicode(_px_to_twips(width)) + new_obj['picw'] = width + new_obj['picwgoal'] = width + new_obj['picscalex'] = '100' + new_obj['picscaley'] = '100' + + obj.content.append(new_obj) + return # img is not allowed to have children as per DTD for child in node: self.process_into(child, obj) + + +def _parse_px(node): + if node and node.lower().endswith('px'): + return int(node[:-2]) + + +def _px_to_twips(px): + return px * 15 diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py index 37bca07..38e907b 100644 --- a/pyth/plugins/xhtml/writer.py +++ b/pyth/plugins/xhtml/writer.py @@ -6,6 +6,8 @@ from pyth import document from pyth.format import PythWriter +import base64 +import re from cStringIO import StringIO @@ -14,6 +16,8 @@ 'bold': 'strong', 'italic': 'em', 'underline': 'u', # ? + 'super': 'sup', + 'sub': 'sub', } @@ -50,26 +54,31 @@ def __init__(self, doc, target, cssClasses=True, pretty=False): document.List: self._list, document.Paragraph: self._paragraph } - + self.paragraphContentDispatch = { + document.Text: self._text, + document.Image: self._image, + } + def go(self): self.listLevel = -1 - + tag = Tag("div") - + for element in self.document.content: handler = self.paragraphDispatch[element.__class__] tag.content.extend(handler(element)) tag.render(self.target) return self.target - + def _paragraph(self, paragraph): p = Tag("p") - for text in paragraph.content: - p.content.append(self._text(text)) + for item in paragraph.content: + handler = self.paragraphContentDispatch[item.__class__] + p.content.append(handler(item)) if self.pretty: return [_prettyBreak, p, _prettyBreak] @@ -79,21 +88,32 @@ def _paragraph(self, paragraph): def _list(self, lst): self.listLevel += 1 - + ul = Tag("ul") if self.cssClasses: ul.attrs['class'] = 'pyth_list_%s' % self.listLevel - + + last_li = None for entry in lst.content: li = Tag("li") for element in entry.content: + # in practice list elements always have only one content child? handler = self.paragraphDispatch[element.__class__] - li.content.extend(handler(element)) - ul.content.append(li) + if handler == self._list: + # this is a sublist, so we shouldn't create an empty li, but rather append ul to prior li. + # Lists can't be immediately sublisted (e.g. there must be at least something at outer level) + # but if that is not the case the last_li will be None and next line will bomb out, which is a + # useful implicit assertion + last_li.content.extend(handler(element)) + else: + li.content.extend(handler(element)) + last_li = li + if li.content: # li might be empty.. + ul.content.append(li) self.listLevel -= 1 - + return [ul] @@ -106,31 +126,41 @@ def _text(self, text): current = tag - for prop in ('bold', 'italic', 'underline'): + for prop in ('bold', 'italic', 'underline', 'sub', 'super'): if prop in text.properties: newTag = Tag(_tagNames[prop]) current.content.append(newTag) current = newTag - for prop in ('sub', 'super'): - if prop in text.properties: - if current.tag is None: - newTag = Tag("span") - current.content.append(newTag) - current = newTag - current.attrs['style'] = "vertical-align: %s; font-size: smaller" % prop - current.content.append(u"".join(text.content)) return tag + def _image(self, image): + if image.properties.get(u'pngblip'): + tag = Tag("img") + image_data = bytearray.fromhex(image.content[0]) + base64_image = base64.b64encode(image_data) + tag.attrs['src'] = "data:image/png;base64,{}".format(base64_image) + height = image['pichgoal'] + width = image['picwgoal'] + if width or height: + styles = [] + styles.append(_twips_to_style_px('width', width)) + styles.append(_twips_to_style_px('height', height)) + style = ';'.join(s for s in styles if s) + if style: + tag.attrs['style'] = style + return tag + else: + return Tag(None) _prettyBreak = object() class Tag(object): - + def __init__(self, tag, attrs=None, content=None): self.tag = tag self.attrs = attrs or {} @@ -155,13 +185,13 @@ def render(self, target): if self.tag is not None: target.write('' % self.tag) - + def attrString(self): return " ".join( '%s="%s"' % (k, quoteAttr(v)) for (k, v) in self.attrs.iteritems()) - + def __repr__(self): return "T(%s)[%s]" % (self.tag, repr(self.content)) @@ -169,8 +199,8 @@ def __repr__(self): def quoteText(text): - return text.replace( - u"&", u"&").replace( + return re.sub( + u'&(?!(amp|lt|gt);)', u'&', text, flags=re.IGNORECASE).replace( u"<", u"<").replace( u">", u">") @@ -179,3 +209,12 @@ def quoteAttr(text): return quoteText(text).replace( u'"', u""").replace( u"'", u"'") + + +def _twips_to_style_px(tag, twips): + try: + twips = int(twips) + except ValueError: + pass + px = int(round(twips / 15.0)) + return "{}:{}px".format(tag, px) diff --git a/tests/html/sample-with-image.html b/tests/html/sample-with-image.html new file mode 100644 index 0000000..f1eea0e --- /dev/null +++ b/tests/html/sample-with-image.html @@ -0,0 +1,3 @@ +

+ +

This is a pretty boring graphic...

diff --git a/tests/rtfs/control_chars.rtf b/tests/rtfs/control_chars.rtf new file mode 100644 index 0000000..a5cfe6e --- /dev/null +++ b/tests/rtfs/control_chars.rtf @@ -0,0 +1,11 @@ +{\rtf1\ansi\ansicpg1252\cocoartf1348\cocoasubrtf170 +{\fonttbl\f0\froman\fcharset0 TimesNewRomanPSMT;} +{\colortbl;\red255\green255\blue255;} +{\info +{\author Kris Powell}}\paperw11900\paperh16840\margl1134\margr1134\margb1134\margt1134\vieww10800\viewh8400\viewkind0 +\deftab709 +\pard\pardeftab709 + +\f0\fs14 \cf0 \~\~\~\~\~\~\~\~\~ NB Spaces +\fs24 \ +} diff --git a/tests/rtfs/list-bug.rtf b/tests/rtfs/list-bug.rtf new file mode 100644 index 0000000..8598069 --- /dev/null +++ b/tests/rtfs/list-bug.rtf @@ -0,0 +1,10 @@ +{\rtf1\ansi\deff1 +{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;} +{\stylesheet{\s1 List Paragraph;}} +{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}} +{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}} +\sa150{Start}\par\pard +\ilvl0\ls0\li720\s1\sa50{1}\par\pard +\ilvl0\ls0\li720\s1\ilvl1\ls0\li1440\s1\sa50{1.1}\par\pard +} \ No newline at end of file diff --git a/tests/rtfs/sample-with-image.rtf b/tests/rtfs/sample-with-image.rtf new file mode 100644 index 0000000..f1bf87f --- /dev/null +++ b/tests/rtfs/sample-with-image.rtf @@ -0,0 +1,58 @@ +{\rtf1\ansi\deff1 +{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;} +{\stylesheet{\s1 List Paragraph;}} +{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}} +{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}} +\sa150{{\field{\*\fldinst{\f0\fs20\cf0 INCLUDEPICTURE "cid:image001.png@01CDC656.1C7FFF50" \\* MERGEFORMATINET }}{\fldrslt{\*\shppict{\pict\pngblip\picw20714\pich12143\picwgoal750\pichgoal750\picscalex100\picscaley100 +89504e470d0a1a0a0000000d4948445200000032000000320802000000915d1fe60000001974455874536f6674776172650041646f626520496d616765526561647971c9653c000012224944415478da6c5969705bd7757e1b1ef0b08320008204094a94 +44ad94146d966551926539719cc9d8711cb5ced8aeed643259da69279349ffb43ffa3f3f5a77927192c64be224b6c793269d249225db916359d52e532b49893b259220419004b1bcb5dfb9f701843c7d241f1f1edebbf7dcb37ce73be78a27cf9c910441 +1045894eecc047c711f0e3d88e6d3b9665d9b66d9ab6659938dbf86ff17b0e1ea3671d7e21d446c085543bf0516687a228f5b3c7e369fc88c770c11fe66f2992880f823b80c83e60749a8d4d685b36be820438591686e002f1c36938b8345ca0ff57acfa +a1b0e33337f124ce5c2c9ce97b41a20f3293890f4a923145088ee230dd08b68d5fb3a6272e13ce5c5b42eda8bf5e9789cfc73f364a561785abcafdc86ee1354526ed89fc8e24d6ecc86cca4c499393392d0bf229dc843569ea76ac1f62c3b1224a4d20d8 +8e4ba4783c7545e28c216860db2e5775c332755dc7430a1789bd4ec3e141a1e66122938c8bc0e573ea2ec56dcd745c3f049a83ffd0b3a669e998c630ad4aa5aaebe50a8e6aa9b45c5a2e4282f9426171610937abe55271b9582a959797972be5b25ed521 +9507e24332a65bb6327789aeab89ae8f99b41cf8bc655574bd5ac1eb98a98a53b158c470cb980da32f2d2d2e154b4b98a558aa94f145b954c2c39849af560d03ba30995e6cb604d206294f9110021e55852a558fea0f0515bfdfeff5fa1cd222a636f0b6 +5ea914b19ca5e262a1802997968b8b0b8b986d61610157a5e54ab18cc9b0ac0a9ec64ca66938964dca43e80822e6a1b57a6032d5e3f57a3d9e6024a2aab8a61fd5e3f1fa3c3e5f805d2a7457f5422cc803fd903b29a416657272ec9db7df5ac8e7971617 +4b581969a0ac1b3a96449e0b4349221ee4b3e0d7a7aa7eaf2f9648a85e1593fabc3e85eee30f27955c95d42fe104c71149f5e41518c5e141c07d106a9078546102918cee7e10795c2bbf78e5c7b76fde4ab6a4429a3f118b6175980ad3610e9c318da0d0 +f0f5a8c19f43e12632edc0fb58d4da1cba388cf1a96afe2988163d648be49c165e71c3c9665e42115ec33a1a885f8a4a6961f1ab478f1a887dd8db11fd9a0fafb3c8546451b619a46226fe1a7db4b0345b70d7c66ed7a25174f1cea9c1a228f13ba40e89 +0494b84ed8e48e847b02138e81a523887ca99223da4a341a9a5fcc9f38f61e5c78e3860dfbf6ef377513b6c6c3b61bf3520d296b73f338e0ffa12f8966e7a2f1f0845a113d5c363c04ef8333c3fff0837b2c50452627dec567d2111b99d99549aea4dbda +c6c7c717e6f388c54c5b3b5095204362fec09f6b4026d734ae96d858129340100cf27ad2ac484a1170d3a378e60af9fffde493e9e9a95853fcc17dbd4dd1a8699922d90a8a72642610cb0fa4c93aacd0c491586c726202cb0a8542a996a469c0822e72b9 +4a61d73c0188ec971c93a39524f97d1a4001bf707c1792c9ad49e28a5efae31f7e77fdcaa5d242e1fae58bfffdee5b954a4911c8901843769fb4e997a7515180dbc04fe1f612623d3f9d83865bd3695f40c37ac9f3b8078b2b3aaa69c7b52b0b50d11ff0 +dfb875fd3ffffd472fffc78f667339c42297090742f3faf56b93e3e3871e7ee49bdffedec1478e0c0f0e5cb97451f579b100a34ad882a06510ef0069302b14592e97c8634447191d1903f4e1eb8ece2c734c8bfc5056803758856512aabb8ecd424fe0be +250a7eaffffcf9b3efbefd1be0682cd6047fe2be8c4129b90bcec8c8705b7bfbaebd0f41053b76ef9e9b9ece643bf285fcc9e3c707fb6f41d33b77ef79f8f0a30b85f95fbdfe8b7822919b9aea5cd3f5e4579e06fc2a77060780d8d16834996ab50c1db3 +03a4f2f9d9997b33e9b64c381c064e40b78263ea701ccb61fe247a7dbe531f7ef83f7f7817ab4ea7db9e7bfec5543255d12b4c2c925cafe8f999d9f68e0ee08a0edb89f2e34f7c05283b3a3c74f5d2f9751b369855fdfd637f5cd7bd3e120e2fcccf8d8f +8d00e8b76cdb4a6ee688cadd8949407532d5a205346406c061219f7ff38dd7a6a7a72391483416c357adad99b6d634ae4281a0d7ab017cde3ffee753278e032e5677adf9fa732fc6e34dc87675e24124cdb690155a906d30282dc6f1c8925e2eb7a4dbfe +e1fb3f54fdfe1b57fbae5ebb86508b45c248698954eab9175e8ac713d56a054f2b482f58e0aad5ab5996255f46686ddabc054c612e9f07eedf9d9cbc7cf102503b180cc5e3cded1d9dc8368337af46c2a1ced55d5f7ae2e9803f50ae9499470a752821ef +17a4fc5c9eb144c8290cddeacfaeca22584f9d3a75e1ec698c8f108187c247aad5eab6ee8d994c7679b928309751f00a749269ef0473017b80a182c1c0a38f7d71efbefd172e9ebf74eedcfcfc1c7c19105f5a5e5e5c58181eba9d88c70381e0ba75eb7b +8f3c0ab6020650833316b39208eea37a7de94ce6ea95cb53d3d3edd98eeb7d7d3ffbc9cb4f3e7d341c8a7cf4c189a7be76146ef3c66bafc2896d4690303e14ebd452036241c864da43b118923ba70d0846785b20183afcd8e777ecdc73f9c2d98be7cf17 +e6f32aa55d8fa6f97c9a0f597fe79ebd08255d376b99c3a945a99b0f76eede7de1ec99d77ff6caba8d9baef75d41b8b477b44fdf9dc21cf1644b6e7a0aaebdb4b4e4b470a6643b6e60d1490e07037bf6ed4ba65b6d96ac084e6ab9046ee1557d6bd6addd +b67d27e071746498e82c63737098544b3a914c4131624302e0078219a9ac3999d434edeae54b83b76e00e08f7cf1f19dbb1ed08281db83fd278ffde9eec438b40081d676af7ffff8b1ceaeae8d9b36598645c0039787f735275236210186c23dfcd512a6 +23319e4cb6cde566e0ba00177ce3a95621faf0f09deecd5bb8938bb50cc95087a736c1a8ea0ff51e5cbb665d6e360704814da11e84f64bdffaeed09dc1544b0ba81834a179b5279e7caa359b3574d385224c087e92027f48b782ab5292146c85d287043a +8678f67ad579169837ae5d8b44a3bb76ef9d9c9c204e6718a0783ddb77a81ea59ec879a9c28c6073ed61a5a17004d1871482cce8305ae7f5696ded1dfe40b0a9298148c7649bb66c6d4a342306eb1942818c8303b737f7f4b075db2ccbc92096ef1dfb33 +949c4ca62e5dbc786770b0399138fab7cfaedfb03197cbf5dfba01da03589f999eea5abddab60c8eb20e21b5c519bccd6a210490c90e52a5e382312ca3331e5257ec6fde7c152cedb12f3fc1408be889224ade7bf7ee8155476251aa64482a796971617e +76a65098bfdd7f0b33013e9efcead732994e70d703870edfb93d800519863e3a34d4bd765d55340497dfd03488a762a5a4f9fc965905f585f34149a08c3271075137ab848e301eb2888135c8786368700089f871501ac1644694e46853b4522a2512894c +a655b04dbfc751453b1e8d6eded2a36981b9fc1cea90dede831b37f7942a1518056e0e254d4c8c73c6bffd733b3821e356446400845efff94f2f9c3b73eae409000a3c1a5a1918e8ffe4e353a3c3b7a3d158281c2e954b7079afe605f507f7edbb745151 +d55d0fec05af55542fa45292a9e6f1e191e1dbfd3b766c0da862c0231141410ccaf2debd7b8ba5e299d3a7612f3204712b111aed3d78f8d6cdeb60f1f7262791d182e188639b5434d428cf42210f8911c27ffdcb07ad6d992d5bb7bdf3e66b284b60b6ab +7d57bef38fdf9fcfcffdfcc72fb7b56780d57ff78d6f828b23e5f9fcbe1b9f5e39fdd14719a4acaeb55d10e3eeddb1e2423ee055648e40c4686c53aff6f46ca58a44d3eaf520687e5b26bbed73bb4cc3c05a474747504df0f8136b1c03a21ffec2175efa +cedf23c75dbe74cea7695f7fe1c57ffed77f7bf685976641beee4d7a58693f393ed6d9d9190a863121a0646a72f2ed377f79ebc6d53688d59ec9002151508d0f0d0324d9ec9caa0b28efb460f8a9bf79e6817d0f9986e9d681a288e4f360efa1603884d4 +34313ece35c488aa532fa9037e3f987ba63d5b2a1671a157cd777ffbeb3ffdfef748622883117018a4f7e0c3dffda71f245bd2c0269472effcfa0d18f799e75fd8dcb34d4239d6b92aab69eaf8c408c5700d1c2d723f72d24ca6031a6eac9d4d96da77ec +debb7dc7cefdbd07e07c9cc2d78a1aa29c5024822b3f9703279bcdcdbcf5cb574545dad4d3c3cb147823349c5db59ab2ba8de428c31fee0cdeeeeaeedebe6b37318ed2e27ccfd64dd5c5d9e5a5b97ca1d01c8bc14f0c11b40b9c16ef520b4268e0f23cc8 +ab95f2c1c39f578919cb703bb1ce1659ee40c9047f9f181dedbb7ce9e0238f9a16a2b6baefc02124afd3a73ec4a88cbcf16a9dd5c60005494ca55bc68686468687b3ed596926976b69c9a8fe6059372726264d59adda227ecd06599cfbaef93f9b6a3568 +8e42a15166e23020f7487f63a343ebd66f40ca4fb7a4bb376d7cfd959fbcf15f3f05e1b979ed2a2a7418911321bc00dc819b3fff8d6fc1b81fffe50380147ccf585e2ac7536d63c3c3fdfd036b37f5d88cc8e29098397826a9e1653d2b8b6e614828e9f6 +9e1cce971d0105f7ee071eeceade4015802ce3c9a79f79f65adfe5603052989f03fb40142553692411067512386338dadcb1ba0b91f4f147a7f6ec3b201f7a6cede0cddcfededeeeeef589745a0bf8c10b563a1f84fb128f7b5ecbd7132001af4b66ea3c +9f9207fe4cdb6ccb74c463cdb43ca0a965831a655775a553a96cb633d9d2a269fe9d7b1e44a264a58303246f6dcf249b53e17064626214be2bfef05f7e70f2d87b95521914af39918c354571118a45fd9adfe355e1a05498b3aaac6648c94dc88ec3c474 +0b545cdaac870897f6692a2297da4c54ce501642aa25f0e68597bb2ab790c1fb1e9f06af80bfcaa45a01a45259df16eb7af6cb63e313c313b9d1f1bb63a3c38852b0284478b4a909a910472c9e884442d470f0fa2556150abcb3546b1909f57625819655 +2eeb54c63bcc1deaf52515fcc06a87ca0f3202fb4a96f18d5e29332d4bc82258046651aa9657f606b66c5f7de0481c69cb308d898989fe8181dcececd4bde991a1c18be7ce12dcf911e9815024dc04dede9c686e8e8723b14020e04501a278681e6465c7 +6601cf628bf4005c71bd10df93dbd9acc3d6580f53cf8c5a2f502a2d82752b607505660627d58d2a90838fe6517c5bb76e87abe201d46ec74e9c4031c0ea32074f8e0edde9fbf4b2a91bc05e5fc01f430d924cc1eed15853380a4183288a18cb65baa346 +9d650af57ea62d415924ac830b81372698b8964048c6b4087a252860e2bc53c0691c16223ba64158456a47eaf50723fe2814e964db338f1f3982548894303f0f1a868a0bdc661a35d6b5be4fe961d913080640b0c0f582e160341a0f45a2c15048f311db +2637c3d8f8b34413eb752c184d642c1265bf440c938533c8270082fc835a3524399e9308d0991158e3854e540ac1f79d542ac53bcdb148241e8b896bd66018d01eb0c2e272319f87a0f3a0fca8c396160b333353a6811a55843ffa7cfe4038ec0f069b22 +2178772018f607020a0b781e4660e716812ccb6022ca0b5b6186b728542422e10e7164ea24c852bd2b447587eaf1a652adac5b6919ac952baed4d812ea994838da99edc41b00d80ab530cb85420167ea4f2c158164f3a5e5b9a9bbec4589f4a7028b7c5e +90d480e6d1c05e433eafeca130473d817ac8341c2e87b3d216e7b4117696f9c601b4ea516150f246cb16564a1dc771db42221165c741c4b0deaf0dfa8a10161911629b0d26f23acee54a05851dce4090858582313b8b57f0aaeaf5c0592123ea2a54cd8a +2051df88359604de97e12ab099a11db7c725701fa6e6374c4e14836d11105631af666d5a56a450b107c7b558df96170c0e0b3e9fcf872547a311e226bac91660a32840e6e1cd511407f003488c5b0a4f1aac33477cc1a43e62bd98a184c305e61b1d8481 +3419a83845184f880c2d45dea667b45bb42dcb6ee8dddfd7bee7398ba11ca844c0ef4175ce5bf8d069a9441d83a56251e14c0690435537ef1eb27a9bef96b0144348cfe042a07e8269886cb7c526dbca1cac85866d1f6765d3850383bdd2b373847a1ee3 +0bc730ba61d70a0129180c82fef7f46c56589fc8625d4644a94c1d310c4492501504e5d1b2101d14dbac296bf396a22b7a7d2fa3610b6865ebc569d8ddb8af432634ea8ebe052cfbfd1ae0595515c3a8287008d66a658997051a01ad65713e27bbad5187 +2fcd550b45abf899ada89a40aed9dcf956aafffbe4a81f9086762a0883a8722eeb55dbb164c35498d6010e000f912738e6dfac370dfdb174ccacc88cc8a6c50336db77e13288b566baebe235f534eee6352a8665c2fa260e4fafb661d2d476d9ca93c757 +14782705147729d1a59716cb336e9f9c18035c8b7535791e733829a0015967dd551be535c7edb5366ee5d58563e85377347733892fc474375d1c2634083f671faeafba1e20b11d00eea480530fee2882e5dcc7d8057b65cba9b6d3e9344cf6d9adbc9a11 +5748aed3e07c342919d3fdf47f020c004820324f63cfbeff0000000049454e44ae426082 +}}}}}\par\pard +\sa150{This is a \i pretty\i0 \b boring\b0 \u160?graphic...}\par\pard +} diff --git a/tests/rtfs/text-attributes.rtf b/tests/rtfs/text-attributes.rtf new file mode 100644 index 0000000..ae2a60a --- /dev/null +++ b/tests/rtfs/text-attributes.rtf @@ -0,0 +1,8 @@ +{\rtf1\ansi\deff1 +{\fonttbl{\f0\fswiss Calibri;}{\f1\froman Times New Roman;}{\f2\fnil\fprq0\fcharset128 Symbol;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;} +{\stylesheet{\s1 List Paragraph;}} +{\*\listtable{\list\listid1\listtemplateid1{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0{\leveltext \'01\u61623 ?;}\fi-180\f2}}} +{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}} +\sa150{\ul Underlined\ul0 }\par\pard +} \ No newline at end of file diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py index e29d283..c8b6807 100644 --- a/tests/test_readrtf15.py +++ b/tests/test_readrtf15.py @@ -30,6 +30,61 @@ class TestRtfFile(unittest.TestCase): pass +class TestRtfWithImage(unittest.TestCase): + + def test_inline_png(self): + sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf") + with open(sample_with_image, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image)) + expected = {'pngblip': True, 'picw': '20714', 'picwgoal': '750', 'pich': '12143', + 'pichgoal': '750', 'picscaley': '100', 'picscalex': '100'} + self.assertEquals(expected, image.properties) + +class TestRtfWithNonbreakingSpaces(unittest.TestCase): + + def test_tildes_are_parsed(self): + sample_with_tildes = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "control_chars.rtf") + with open(sample_with_tildes, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + traverse_text(doc, lambda text: self.assertNotIn('~', text)) + + +class TestNestedLists(unittest.TestCase): + + def test_when_last_item_sublist_item(self): + """ With structures like this, both lists were getting dropped + Start + * 1 + * 1.1 + """ + list_bug = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "list-bug.rtf") + with open(list_bug, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + text = [] + traverse_text(doc, lambda x: text.append(x)) + self.assertIn('Start', text) + self.assertIn('1', text) + self.assertIn('1.1', text) + + +class TestTextProperties(unittest.TestCase): + + def test_reads_underline(self): + text = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "text-attributes.rtf") + with open(text, 'rb') as rtf: + doc = Rtf15Reader.read(rtf) + underlined = doc.content[0].content[0] + self.assertTrue(underlined['underline']) + + +def traverse_text(element, function): + if element.__class__ == pyth.document.Text: + map(function, element.content) + else: + for child in element.content: + traverse_text(child, function) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_readxhtml.py b/tests/test_readxhtml.py index 978c277..9814fd9 100644 --- a/tests/test_readxhtml.py +++ b/tests/test_readxhtml.py @@ -53,6 +53,24 @@ def test_italic(self): text = doc.content[0].content[0] assert text['italic'] + def test_underline(self): + """ + Try to read a paragraph containing underline + """ + xhtml = "

sub

" + doc = XHTMLReader.read(xhtml) + text = doc.content[0].content[0] + assert text['underline'] + + def test_underline_styling(self): + """ + Try to read a paragraph containing underline via CSS + """ + xhtml = '

underline

' + doc = XHTMLReader.read(xhtml) + text = doc.content[0].content[0] + assert text['underline'] + def test_sub(self): """ Try to read a paragraph containing subscript @@ -80,6 +98,22 @@ def test_url(self): text = doc.content[0].content[0] assert text['url'] == "http://google.com" + def test_inline_png(self): + pixels = 50 + twips = pixels * 15 + height = width = str(twips) # in retrospect choosing a square image wasn't a great idea :) + with open('tests/html/sample-with-image.html', 'rb') as xhtml: + doc = XHTMLReader.read(xhtml) + image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image)) + self.assertEquals(image.content[0][1:4], u'PNG') + self.assertEquals(image['pngblip'], True) + self.assertEquals(image['pich'], height) + self.assertEquals(image['pichgoal'], height) + self.assertEquals(image['picw'], width) + self.assertEquals(image['picwgoal'], width) + self.assertEquals(image['picscaley'], '100') + self.assertEquals(image['picscalex'], '100') + if __name__ == '__main__': unittest.main() diff --git a/tests/test_writertf15.py b/tests/test_writertf15.py new file mode 100644 index 0000000..d44e1c0 --- /dev/null +++ b/tests/test_writertf15.py @@ -0,0 +1,28 @@ +import os +import unittest +from pyth.plugins.xhtml.reader import XHTMLReader +from pyth.plugins.rtf15.writer import Rtf15Writer +from pyth.document import Document, Paragraph, Text + +class TestRtfWithImage(unittest.TestCase): + + def test_inline_png(self): + sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "html", "sample-with-image.html") + with open(sample_with_image, 'rb') as rtf: + source = XHTMLReader.read(rtf) + doc = Rtf15Writer.write(source).getvalue() + self.assertIn('pngblip', doc) + self.assertIn('picwgoal750\\', doc) + self.assertIn('pichgoal750\\', doc) + + + def test_underline_output(self): + text = Text(content=[u'Underlined'], properties={'underline': True}) + para = Paragraph(content=[text]) + doc = Document(content=[para]) + result = Rtf15Writer.write(doc).getvalue() + self.assertIn('\\ul Underlined\\ul0', result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_writexhtml.py b/tests/test_writexhtml.py new file mode 100644 index 0000000..cc12f40 --- /dev/null +++ b/tests/test_writexhtml.py @@ -0,0 +1,28 @@ +import os +import unittest +from pyth.plugins.rtf15.reader import Rtf15Reader +from pyth.plugins.xhtml.writer import XHTMLWriter +from pyth.document import Document, Paragraph, Text + +class TestHtmlWithImage(unittest.TestCase): + + def test_inline_png(self): + sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf") + with open(sample_with_image, 'rb') as rtf: + source = Rtf15Reader.read(rtf) + doc = XHTMLWriter.write(source).getvalue() + self.assertIn('