diff --git a/src/cmap.rs b/src/cmap.rs index d61f3c31..ed5338d7 100644 --- a/src/cmap.rs +++ b/src/cmap.rs @@ -6,7 +6,7 @@ use lopdf::{Dictionary, Document, Object}; use crate::text::CMap; /// The mapping from a CID to one or more Unicode code points. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct ToUnicodeCMap { pub mappings: BTreeMap>, } @@ -91,7 +91,7 @@ impl ToUnicodeCMap { } /// Generates a CMap string representation suitable for embedding in a PDF. - pub fn to_cmap_string(&self, font_name: &str) -> String { + pub fn to_cmap_string(&self, font_name: &str, single_byte_cids: bool) -> String { // Header section let mut result = format!( "/CIDInit /ProcSet findresource begin\n\n12 dict begin\n\nbegincmap\n\n%!PS-Adobe-3.0 \ @@ -125,7 +125,20 @@ impl ToUnicodeCMap { for chunk in entries.chunks(100) { result.push_str(&format!("{} beginbfchar\n", chunk.len())); for &(cid, unicode) in chunk { - result.push_str(&format!("<{:04X}> <{:04X}>\n", cid, unicode)); + // force 2 byte representation for unicode values <= 0xFFFF, 4 byte otherwise + if unicode <= 0xFFFF { + if single_byte_cids { + result.push_str(&format!("<{:02X}> <{:04X}>\n", cid, unicode)); + } else { + result.push_str(&format!("<{:04X}> <{:04X}>\n", cid, unicode)); + } + } else { + if single_byte_cids { + result.push_str(&format!("<{:02X}> <{:08X}>\n", cid, unicode)); + } else { + result.push_str(&format!("<{:04X}> <{:08X}>\n", cid, unicode)); + } + } } result.push_str("endbfchar\n"); } @@ -158,7 +171,27 @@ fn parse_hex_token(token: &str) -> Result { /// Implement the CMap trait on our ToUnicodeCMap. impl CMap for ToUnicodeCMap { + /// map single byte characters to their unicode representation fn map_bytes(&self, bytes: &[u8]) -> String { + // For simplicity, assume that the byte sequence represents single characters, each 1 byte long. 
+ let mut result = String::new(); + let mut i = 0; + while i < bytes.len() { + let cid = bytes[i] as u32; + if let Some(unis) = self.mappings.get(&cid) { + for &u in unis { + if let Some(ch) = std::char::from_u32(u) { + result.push(ch); + } + } + } + i += 1; + } + result + } + + /// map double byte characters to their unicode representation + fn map_bytes_u16be(&self, bytes: &[u8]) -> String { // For simplicity, assume that the byte sequence represents CIDs in big-endian, // and that each CID is 2 bytes long. let mut result = String::new(); diff --git a/src/deserialize.rs b/src/deserialize.rs index 376f6b2c..c2d22444 100644 --- a/src/deserialize.rs +++ b/src/deserialize.rs @@ -13,7 +13,7 @@ use lopdf::{ use serde_derive::{Deserialize, Serialize}; use crate::{ - BuiltinFont, BuiltinOrExternalFontId, Color, DictItem, ExtendedGraphicsState, ExtendedGraphicsStateId, ExtendedGraphicsStateMap, FontId, LayerInternalId, Line, LineDashPattern, LinePoint, LinkAnnotation, Op, PageAnnotId, PageAnnotMap, PaintMode, ParsedFont, PdfDocument, PdfDocumentInfo, PdfFontMap, PdfLayerMap, PdfMetadata, PdfPage, PdfResources, Point, Polygon, PolygonRing, Pt, RawImage, Rect, RenderingIntent, TextItem, TextMatrix, TextRenderingMode, WindingOrder, XObject, XObjectId, XObjectMap, cmap::ToUnicodeCMap, conformance::PdfConformance, date::{OffsetDateTime, parse_pdf_date} + BuiltinFont, BuiltinOrExternalFontId, Color, DictItem, ExtendedGraphicsState, ExtendedGraphicsStateId, ExtendedGraphicsStateMap, FontId, LayerInternalId, Line, LineDashPattern, LinePoint, LinkAnnotation, Op, PageAnnotId, PageAnnotMap, PaintMode, ParsedFont, ParsedSubsetFont, PdfDocument, PdfDocumentInfo, PdfFontMap, PdfLayerMap, PdfMetadata, PdfPage, PdfResources, PdfSubsetFontMap, Point, Polygon, PolygonRing, Pt, RawImage, Rect, RenderingIntent, TextItem, TextMatrix, TextRenderingMode, WindingOrder, XObject, XObjectId, XObjectMap, cmap::ToUnicodeCMap, conformance::PdfConformance, date::{OffsetDateTime, parse_pdf_date} }; 
#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] @@ -241,10 +241,14 @@ fn parse_pdf_from_bytes_end( // Extract ExtGStates from resources let extgstates = extract_extgstates(&doc, &objs_to_search_for_resources, warnings); - let fonts = fonts - .into_iter() + let parsedfonts = fonts + .iter() .filter_map(|(id, pf)| Some((FontId(id.get_id().to_string()), pf.as_parsed_font()?))) .collect(); + let subsetfonts = fonts + .iter() + .filter_map(|(id, pf)| Some((FontId(id.get_id().to_string()), pf.as_parsedsubset_font()?))) + .collect(); // Build the final PdfDocument. let pdf_doc = PdfDocument { @@ -253,7 +257,8 @@ fn parse_pdf_from_bytes_end( xmp: None, }, resources: PdfResources { - fonts: PdfFontMap { map: fonts }, + fonts: PdfFontMap { map: parsedfonts }, + subsetfonts: PdfSubsetFontMap { map: subsetfonts }, xobjects: XObjectMap { map: xobjects }, extgstates: ExtendedGraphicsStateMap { map: extgstates }, layers: PdfLayerMap { @@ -280,19 +285,28 @@ fn parse_pdf_from_bytes_end( pub enum ParsedOrBuiltinFont { P(ParsedFont, Option), B(BuiltinFont), + PS(ParsedSubsetFont, Option), } impl ParsedOrBuiltinFont { - fn as_parsed_font(self) -> Option { + fn as_parsed_font(&self) -> Option { match self { - ParsedOrBuiltinFont::P(p, _) => Some(p), - ParsedOrBuiltinFont::B(_) => None, + ParsedOrBuiltinFont::P(p, _) => Some(p.clone()), + ParsedOrBuiltinFont::B(_) | ParsedOrBuiltinFont::PS( .. ) => None, + } + } + + fn as_parsedsubset_font(&self) -> Option { + match self { + ParsedOrBuiltinFont::P( .. ) | ParsedOrBuiltinFont::B( .. 
) => None, + ParsedOrBuiltinFont::PS(p, _) => Some(p.clone()), } } fn cmap(&self) -> Option<&ToUnicodeCMap> { match self { - ParsedOrBuiltinFont::P(_, cmap) => cmap.as_ref(), + ParsedOrBuiltinFont::P(_, cmap) + | ParsedOrBuiltinFont::PS(_, cmap) => cmap.as_ref(), ParsedOrBuiltinFont::B(_) => None, } } @@ -3365,9 +3379,7 @@ mod parsefont { use super::ParsedOrBuiltinFont; use crate::{ - cmap::ToUnicodeCMap, - deserialize::{get_dict_or_resolve_ref, PdfWarnMsg}, - BuiltinFont, FontId, ParsedFont, + BuiltinFont, FontId, ParsedFont, ParsedSubsetFont, ParsedSubsetFontCIDSystemInfo, ParsedSubsetFontCustomEncoding, ParsedSubsetFontDescendantFont, ParsedSubsetFontDescriptorProperties, ParsedSubsetFontProperties, cmap::ToUnicodeCMap, deserialize::{PdfWarnMsg, get_dict_or_resolve_ref} }; /// Main function to parse fonts from PDF resources @@ -3409,17 +3421,168 @@ mod parsefont { }; // Extract ToUnicode CMap directly from the font dictionary (common to all font types) - let to_unicode_cmap = extract_to_unicode_cmap(doc, font_dict, warnings, page_num); + let (to_unicode_cmap, to_unicode_cmap_bytes) = extract_to_unicode_cmap(doc, font_dict, warnings, page_num); // Handle different font types if &font_type == b"Type0" { - if let Some(parsed_font) = - process_type0_font(doc, font_dict, &font_id, warnings, page_num) - { - fonts_map.insert( - font_id, - ParsedOrBuiltinFont::P(parsed_font, to_unicode_cmap), - ); + match process_type0_font(doc, font_dict, &font_id, warnings, page_num) { + Some(ParsedOrBuiltinFont::P(parsed_font, _)) => { + fonts_map.insert( + font_id, + ParsedOrBuiltinFont::P(parsed_font, to_unicode_cmap) + ); + }, + Some(ParsedOrBuiltinFont::PS(mut parsed_font, _)) => { + // remember the original ToUnicode map, needed during serialize + parsed_font.cmap_bytes = to_unicode_cmap_bytes.clone(); + parsed_font.cmap = to_unicode_cmap.clone(); + // remember the font properties + let encoding: Option; + let custom_encoding: Option; + match font_dict.get(b"Encoding") { + 
Ok(Object::Name(enc)) => { + // standard encoding + encoding = Some(String::from_utf8_lossy(enc).to_string()); + custom_encoding = None; + }, + Ok(Object::Dictionary(dict)) => { + // custom encoding + encoding = None; + let base_encoding = match dict.get(b"BaseEncoding") { + Ok(Object::Name(enc)) => Some(String::from_utf8_lossy(enc).to_string()), + _ => None, + }; + let differences = match dict.get(b"Differences") { + Ok(Object::Array(differences)) => Some(differences.clone()), + _ => None, + }; + custom_encoding = Some(ParsedSubsetFontCustomEncoding { + base_encoding, + differences, + }); + }, + Ok(Object::Reference(r)) => { + // custom encoding + encoding = None; + if let Ok(encoding_dict) = doc.get_object(*r).and_then(|obj| obj.as_dict()) { + let base_encoding = match encoding_dict.get(b"BaseEncoding") { + Ok(Object::Name(enc)) => Some(String::from_utf8_lossy(enc).to_string()), + _ => None, + }; + let differences = match encoding_dict.get(b"Differences") { + Ok(Object::Array(differences)) => Some(differences.clone()), + _ => None, + }; + custom_encoding = Some(ParsedSubsetFontCustomEncoding { + base_encoding, + differences, + }); + } else { + custom_encoding = None; + } + }, + _ => { + encoding = None; + custom_encoding = None; + }, + } + let base_font = match font_dict.get(b"BaseFont") { + Ok(Object::Name(base_font)) => Some(String::from_utf8_lossy(base_font).to_string()), + _ => None, + }; + let descendant_fonts = if font_dict.has(b"DescendantFonts") { + match get_descendant_font_dict(doc, font_dict, &font_id, warnings, page_num) { + Some(descendant_font_dict) => { + let base_font = match descendant_font_dict.get(b"BaseFont") { + Ok(Object::Name(base_font)) => Some(String::from_utf8_lossy(base_font).to_string()), + _ => None, + }; + let subtype = match descendant_font_dict.get(b"Subtype") { + Ok(Object::Name(subtype)) => Some(String::from_utf8_lossy(subtype).to_string()), + _ => None, + }; + let cid_to_gid_map = match descendant_font_dict.get(b"CIDToGIDMap") { 
+ Ok(Object::Name(cid_to_gid_map)) => Some(String::from_utf8_lossy(cid_to_gid_map).to_string()), + _ => None, + }; + let dw = match descendant_font_dict.get(b"DW") { + Ok(Object::Integer(dw)) => Some(*dw), + _ => None, + }; + let cid_system_info = match descendant_font_dict.get(b"CIDSystemInfo") { + Ok(Object::Dictionary(cidsysteminfo_dict)) => { + let ordering = match cidsysteminfo_dict.get(b"Ordering") { + Ok(Object::String(ordering, _)) => Some(String::from_utf8_lossy(ordering).to_string()), + _ => None, + }; + let registry = match cidsysteminfo_dict.get(b"Registry") { + Ok(Object::String(registry, _)) => Some(String::from_utf8_lossy(registry).to_string()), + _ => None, + }; + let supplement = match cidsysteminfo_dict.get(b"Supplement") { + Ok(Object::Integer(supplement)) => Some(*supplement), + _ => None, + }; + Some(ParsedSubsetFontCIDSystemInfo { + ordering, + registry, + supplement, + }) + }, + Ok(Object::Reference(id)) => { + if let Ok(cidsysteminfo_dict) = doc.get_dictionary(*id) { + let ordering = match cidsysteminfo_dict.get(b"Ordering") { + Ok(Object::String(ordering, _)) => Some(String::from_utf8_lossy(ordering).to_string()), + _ => None, + }; + let registry = match cidsysteminfo_dict.get(b"Registry") { + Ok(Object::String(registry, _)) => Some(String::from_utf8_lossy(registry).to_string()), + _ => None, + }; + let supplement = match cidsysteminfo_dict.get(b"Supplement") { + Ok(Object::Integer(supplement)) => Some(*supplement), + _ => None, + }; + Some(ParsedSubsetFontCIDSystemInfo { + ordering, + registry, + supplement, + }) + } else { + None + } + } + _ => None, + }; + Some(vec![ParsedSubsetFontDescendantFont { + base_font, + subtype, + dw, + cid_to_gid_map, + cid_system_info, + }]) + }, + None => None, + } + } else { + None + }; + let font_properties = ParsedSubsetFontProperties { + encoding, + custom_encoding, + first_char: None, + last_char: None, + widths: None, + base_font, + descendant_fonts, + }; + parsed_font.font_properties = 
font_properties; + fonts_map.insert( + font_id, + ParsedOrBuiltinFont::PS(parsed_font, to_unicode_cmap) + ); + }, + _ => {}, } } else { match process_standard_font(doc, font_dict, &font_id, warnings, page_num) { @@ -3435,6 +3598,87 @@ mod parsefont { ParsedOrBuiltinFont::P(parsed_font, to_unicode_cmap) ); }, + Some(ParsedOrBuiltinFont::PS(mut parsed_font, _)) => { + // remember the original ToUnicode map, needed during serialize + parsed_font.cmap_bytes = to_unicode_cmap_bytes.clone(); + parsed_font.cmap = to_unicode_cmap.clone(); + // remember the font properties + let encoding: Option; + let custom_encoding: Option; + match font_dict.get(b"Encoding") { + Ok(Object::Name(enc)) => { + // standard encoding + encoding = Some(String::from_utf8_lossy(enc).to_string()); + custom_encoding = None; + }, + Ok(Object::Dictionary(dict)) => { + // custom encoding + encoding = None; + let base_encoding = match dict.get(b"BaseEncoding") { + Ok(Object::Name(enc)) => Some(String::from_utf8_lossy(enc).to_string()), + _ => None, + }; + let differences = match dict.get(b"Differences") { + Ok(Object::Array(differences)) => Some(differences.clone()), + _ => None, + }; + custom_encoding = Some(ParsedSubsetFontCustomEncoding { + base_encoding, + differences, + }); + }, + Ok(Object::Reference(r)) => { + // custom encoding + encoding = None; + if let Ok(encoding_dict) = doc.get_object(*r).and_then(|obj| obj.as_dict()) { + let base_encoding = match encoding_dict.get(b"BaseEncoding") { + Ok(Object::Name(enc)) => Some(String::from_utf8_lossy(enc).to_string()), + _ => None, + }; + let differences = match encoding_dict.get(b"Differences") { + Ok(Object::Array(differences)) => Some(differences.clone()), + _ => None, + }; + custom_encoding = Some(ParsedSubsetFontCustomEncoding { + base_encoding, + differences, + }); + } else { + custom_encoding = None; + } + }, + _ => { + encoding = None; + custom_encoding = None; + }, + } + let first_char = match font_dict.get(b"FirstChar") { + 
Ok(Object::Integer(val)) => Some(*val), + _ => None, + }; + let last_char = match font_dict.get(b"LastChar") { + Ok(Object::Integer(val)) => Some(*val), + _ => None, + }; + let widths = match font_dict.get(b"Widths") { + Ok(Object::Array(widths)) => Some(widths.clone()), + _ => None, + }; + let font_properties = ParsedSubsetFontProperties { + encoding, + custom_encoding, + first_char, + last_char, + widths, + base_font: None, + descendant_fonts: None, + }; + parsed_font.font_properties = font_properties; + fonts_map.insert( + font_id, + ParsedOrBuiltinFont::PS(parsed_font, to_unicode_cmap) + ); + }, None => {} } } @@ -3449,7 +3693,7 @@ mod parsefont { font_dict: &Dictionary, warnings: &mut Vec, page_num: usize, - ) -> Option { + ) -> (Option, Option>) { // Check if font dictionary has a ToUnicode entry if let Ok(to_unicode_ref) = font_dict.get(b"ToUnicode") { // Get the ToUnicode stream @@ -3470,11 +3714,11 @@ mod parsefont { }; // Convert to string - if let Ok(cmap_str) = String::from_utf8(content) { + if let Ok(cmap_str) = String::from_utf8(content.clone()) { // Parse using ToUnicodeCMap::parse match ToUnicodeCMap::parse(&cmap_str) { Ok(cmap) => { - return Some(cmap); + return (Some(cmap), Some(content)); } Err(e) => { warnings.push(PdfWarnMsg::warning( @@ -3488,7 +3732,7 @@ mod parsefont { } } - None + (None, None) } /// Get fonts dictionary from PDF resources @@ -3541,7 +3785,7 @@ mod parsefont { font_id: &FontId, warnings: &mut Vec, page_num: usize, - ) -> Option { + ) -> Option { // Get the descendant font dictionary let descendant_font_dict = get_descendant_font_dict(doc, font_dict, font_id, warnings, page_num)?; @@ -3584,7 +3828,38 @@ mod parsefont { } } }, - Ok(Object::Reference(id)) => doc.get_dictionary(*id).ok(), + Ok(Object::Reference(id)) => { + match doc.get_object(*id) { + Ok(Object::Array(arr)) if !arr.is_empty() => { + // Get first descendant font + match arr[0].as_dict().ok().or_else(|| { + if let Ok(id) = arr[0].as_reference() { + 
doc.get_dictionary(id).ok() + } else { + None + } + }) { + Some(d) => Some(d), + None => { + warnings.push(PdfWarnMsg::warning( + page_num, + 0, + format!("Cannot resolve descendant font for {}", font_id.0), + )); + None + } + } + }, + _ => { + warnings.push(PdfWarnMsg::warning( + page_num, + 0, + format!("Cannot resolve descendant font for {}", font_id.0), + )); + None + } + } + }, _ => { warnings.push(PdfWarnMsg::warning( page_num, @@ -3628,15 +3903,32 @@ mod parsefont { } } + /// Get the font file stream + fn get_font_file_dictionary<'a>( + doc: &'a Document, + font_file_ref: &'a Object, + _warnings: &mut Vec, + ) -> Option<&'a Dictionary> { + // Get font stream + match font_file_ref { + Object::Stream(s) => Some(&s.dict), + Object::Reference(r) => match doc.get_object(*r) { + Ok(Object::Stream(s)) => Some(&s.dict), + _ => None, + }, + _ => None, + } + } + /// Process font data (extract, decompress, parse) and handle ToUnicode CMap fn process_font_data( doc: &Document, - _font_dict: &Dictionary, + font_dict: &Dictionary, font_descriptor: &Dictionary, font_id: &FontId, warnings: &mut Vec, page_num: usize, - ) -> Option { + ) -> Option { // Try each font file type for font_file_key in &[b"FontFile", b"FontFile2" as &[u8], b"FontFile3"] { if let Ok(font_file_ref) = font_descriptor.get(font_file_key) { @@ -3653,9 +3945,198 @@ mod parsefont { ), )); - // Parse the font + // subsetted fonts have six upper case characters followed by a + sign + if let Some(font_name) = match font_dict.get(b"BaseFont") { + Ok(Object::Name(font_name)) => Some(String::from_utf8_lossy(font_name).to_string()), + _ => None, + } { + let is_subsetted: bool; + if font_name.len() > 7 && font_name.chars().nth(6) == Some('+') { + is_subsetted = font_name.chars().take(6).all(|c| c.is_ascii_uppercase()) + } else { + is_subsetted = false + } + + if is_subsetted { + // check if this is an embedded CID TrueType font + if font_file_key == b"FontFile2" { + if let Some(mut parsedsubset_font) = 
ParsedSubsetFont::ttf_from_bytes(&font_data, 0, warnings) { + parsedsubset_font.font_name = Some(font_name); + // remember the font descriptor properties + let charset = match font_descriptor.get(b"CharSet") { + Ok(Object::String(charset, _)) => Some(String::from_utf8_lossy(charset).to_string()), + _ => None, + }; + let font_family = match font_descriptor.get(b"FontFamily") { + Ok(Object::String(font_family, _)) => Some(String::from_utf8_lossy(font_family).to_string()), + _ => None, + }; + let font_stretch = match font_descriptor.get(b"FontStretch") { + Ok(Object::Name(font_stretch)) => Some(String::from_utf8_lossy(font_stretch).to_string()), + _ => None, + }; + let ascent = match font_descriptor.get(b"Ascent") { + Ok(Object::Integer(ascent)) => Some(*ascent), + _ => None, + }; + let descent = match font_descriptor.get(b"Descent") { + Ok(Object::Integer(descent)) => Some(*descent), + _ => None, + }; + let cap_height = match font_descriptor.get(b"CapHeight") { + Ok(Object::Integer(cap_height)) => Some(*cap_height), + _ => None, + }; + let flags = match font_descriptor.get(b"Flags") { + Ok(Object::Integer(flags)) => Some(*flags), + _ => None, + }; + let italic_angle = match font_descriptor.get(b"ItalicAngle") { + Ok(Object::Integer(italic_angle)) => Some(*italic_angle), + _ => None, + }; + let font_weight = match font_descriptor.get(b"FontWeight") { + Ok(Object::Integer(font_weight)) => Some(*font_weight), + _ => None, + }; + let stemv = match font_descriptor.get(b"StemV") { + Ok(Object::Integer(stemv)) => Some(*stemv), + _ => None, + }; + let xheight = match font_descriptor.get(b"XHeight") { + Ok(Object::Integer(xheight)) => Some(*xheight), + _ => None, + }; + let font_bbox = match font_descriptor.get(b"FontBBox") { + Ok(Object::Array(font_bbox)) => Some(font_bbox.clone()), + _ => None, + }; + let cid_set = match font_descriptor.get(b"CIDSet") { + Ok(Object::Reference(id)) => match doc.get_object(*id) { + Ok(Object::Stream(stream)) => 
Some(stream.decompressed_content().unwrap().clone()), + _ => None, + }, + _ => None, + }; + let font_descriptor_properties = ParsedSubsetFontDescriptorProperties { + charset, + font_family, + font_stretch, + ascent, + descent, + cap_height, + flags, + italic_angle, + font_weight, + stemv, + xheight, + font_bbox, + cid_set, + }; + parsedsubset_font.font_descriptor_properties = font_descriptor_properties; + return Some(ParsedOrBuiltinFont::PS(parsedsubset_font, None)); + } else { + warnings.push(PdfWarnMsg::error( + page_num, + 0, + format!("Failed to parse font data for {}", font_id.0), + )); + } + } + // check if this is a CFF (compressed font format) + if font_file_key == b"FontFile3" { + if let Some(font_file_ref_dict) = get_font_file_dictionary(doc, font_file_ref, warnings) { + if let Ok(subtype) = font_file_ref_dict.get(b"Subtype") { + match subtype { + Object::Name(items) => { + if items == b"Type1C" { + if let Some(mut parsedsubset_font) = ParsedSubsetFont::cff_from_bytes(&font_data, 0, warnings) { + // remember the font descriptor properties + let charset = match font_descriptor.get(b"CharSet") { + Ok(Object::String(charset, _)) => Some(String::from_utf8_lossy(charset).to_string()), + _ => None, + }; + let font_family = match font_descriptor.get(b"FontFamily") { + Ok(Object::String(font_family, _)) => Some(String::from_utf8_lossy(font_family).to_string()), + _ => None, + }; + let font_stretch = match font_descriptor.get(b"FontStretch") { + Ok(Object::Name(font_stretch)) => Some(String::from_utf8_lossy(font_stretch).to_string()), + _ => None, + }; + let ascent = match font_descriptor.get(b"Ascent") { + Ok(Object::Integer(ascent)) => Some(*ascent), + _ => None, + }; + let descent = match font_descriptor.get(b"Descent") { + Ok(Object::Integer(descent)) => Some(*descent), + _ => None, + }; + let cap_height = match font_descriptor.get(b"CapHeight") { + Ok(Object::Integer(cap_height)) => Some(*cap_height), + _ => None, + }; + let flags = match 
font_descriptor.get(b"Flags") { + Ok(Object::Integer(flags)) => Some(*flags), + _ => None, + }; + let italic_angle = match font_descriptor.get(b"ItalicAngle") { + Ok(Object::Integer(italic_angle)) => Some(*italic_angle), + _ => None, + }; + let font_weight = match font_descriptor.get(b"FontWeight") { + Ok(Object::Integer(font_weight)) => Some(*font_weight), + _ => None, + }; + let stemv = match font_descriptor.get(b"StemV") { + Ok(Object::Integer(stemv)) => Some(*stemv), + _ => None, + }; + let xheight = match font_descriptor.get(b"XHeight") { + Ok(Object::Integer(xheight)) => Some(*xheight), + _ => None, + }; + let font_bbox = match font_descriptor.get(b"FontBBox") { + Ok(Object::Array(font_bbox)) => Some(font_bbox.clone()), + _ => None, + }; + let font_descriptor_properties = ParsedSubsetFontDescriptorProperties { + charset, + font_family, + font_stretch, + ascent, + descent, + cap_height, + flags, + italic_angle, + font_weight, + stemv, + xheight, + font_bbox, + cid_set: None, + }; + parsedsubset_font.font_descriptor_properties = font_descriptor_properties; + return Some(ParsedOrBuiltinFont::PS(parsedsubset_font, None)); + } else { + warnings.push(PdfWarnMsg::error( + page_num, + 0, + format!("Failed to parse font data for {}", font_id.0), + )); + } + } + }, + _ => {} + } + } + } + } + } + } + + // Parse the font (not subsetted, should be a full font) if let Some(parsed_font) = ParsedFont::from_bytes(&font_data, 0, warnings) { - return Some(parsed_font); + return Some(ParsedOrBuiltinFont::P(parsed_font, None)); } else { warnings.push(PdfWarnMsg::error( page_num, @@ -3698,7 +4179,7 @@ mod parsefont { font_id: &FontId, warnings: &mut Vec, page_num: usize, - ) -> Option { + ) -> Option { // Get the font descriptor let font_descriptor = get_font_descriptor(doc, font_dict, font_id, warnings, page_num)?; @@ -3727,7 +4208,7 @@ mod parsefont { None => { match process_type1_font(doc, font_dict, font_id, warnings, page_num) { Some(parsed_font) => { - 
Some(ParsedOrBuiltinFont::P(parsed_font, None)) + Some(parsed_font) }, None => { warnings.push(PdfWarnMsg::warning( diff --git a/src/font.rs b/src/font.rs index 3d0dca33..f47ee96e 100644 --- a/src/font.rs +++ b/src/font.rs @@ -304,6 +304,8 @@ impl BuiltinFont { pub enum FontType { OpenTypeCFF(Vec), OpenTypeCFF2, + ParsedEmbeddedType0(Vec), + ParsedEmbeddedType1C(Vec), #[default] TrueType, } @@ -438,6 +440,69 @@ impl ParsedFont { } } +#[derive(Clone, Default)] +pub struct ParsedSubsetFontCustomEncoding { + pub base_encoding: Option, + pub differences: Option>, +} + +#[derive(Clone, Default)] +pub struct ParsedSubsetFontCIDSystemInfo { + pub ordering: Option, + pub registry: Option, + pub supplement: Option, +} + +#[derive(Clone, Default)] +pub struct ParsedSubsetFontDescendantFont { + pub base_font: Option, + pub subtype: Option, + pub dw: Option, + pub cid_to_gid_map: Option, + pub cid_system_info: Option, +} + +#[derive(Clone, Default)] +pub struct ParsedSubsetFontProperties { + pub encoding: Option, + pub custom_encoding: Option, + pub first_char: Option, + pub last_char: Option, + pub widths: Option>, + pub base_font: Option, + pub descendant_fonts: Option>, +} + +#[derive(Clone, Default)] +pub struct ParsedSubsetFontDescriptorProperties { + pub charset: Option, + pub font_family: Option, + pub font_stretch: Option, + pub ascent: Option, + pub descent: Option, + pub cap_height: Option, + pub flags: Option, + pub italic_angle: Option, + pub font_weight: Option, + pub stemv: Option, + pub xheight: Option, + pub font_bbox: Option>, + pub cid_set: Option>, +} + +/// In contrast to ParsedFont this font was embedded as a subset font and therefore cannot be used to shape new text elements. +/// When serializing the PDF this font is re-embedded as is, keeping the parsed text elements intact. 
+#[derive(Clone, Default)] +pub struct ParsedSubsetFont { + pub original_bytes: Vec, + pub font_type: FontType, + pub font_name: Option, + pub cmap: Option, + pub cmap_bytes: Option>, + pub font_properties: ParsedSubsetFontProperties, + pub font_descriptor_properties: ParsedSubsetFontDescriptorProperties, +} + pub trait PrepFont { fn lgi(&self, codepoint: u32) -> Option; @@ -510,6 +575,19 @@ impl fmt::Debug for ParsedFont { } } +impl PartialEq for ParsedSubsetFont { + fn eq(&self, other: &Self) -> bool { + self.original_bytes.len() == other.original_bytes.len() + } +} + +impl fmt::Debug for ParsedSubsetFont { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ParsedSubsetFont") + .finish() + } +} + #[derive(Debug, Clone)] pub struct SubsetFont { pub bytes: Vec, @@ -692,7 +770,7 @@ impl ParsedFont { // Create the CMap and generate its string representation let cmap = ToUnicodeCMap { mappings }; - cmap.to_cmap_string(&font_id.0) + cmap.to_cmap_string(&font_id.0, false) } pub(crate) fn generate_gid_to_cid_map(&self, glyph_ids: &[(u16, char)]) -> Vec<(u16, u16)> { @@ -1614,6 +1692,68 @@ impl ParsedFont { } } +impl ParsedSubsetFont { + pub fn ttf_from_bytes( + font_bytes: &[u8], + _font_index: usize, + warnings: &mut Vec, + ) -> Option { + warnings.push(PdfWarnMsg::info( + 0, + 0, + "Successfully read embedded TrueType font data".to_string(), + )); + let parsedsubset_font = ParsedSubsetFont{ + original_bytes: font_bytes.to_vec(), + font_type: FontType::ParsedEmbeddedType0(font_bytes.to_vec()), + font_name: None, + cmap: None, + cmap_bytes: None, + font_properties: ParsedSubsetFontProperties::default(), + font_descriptor_properties: ParsedSubsetFontDescriptorProperties::default(), + }; + Some(parsedsubset_font) + } + + pub fn cff_from_bytes( + font_bytes: &[u8], + _font_index: usize, + warnings: &mut Vec, + ) -> Option { + let scope = allsorts_subset_browser::binary::read::ReadScope::new(font_bytes); + let cff = match scope.read::>() { + 
Ok(cff) => { + warnings.push(PdfWarnMsg::info( + 0, + 0, + "Successfully read embedded CFF font data".to_string(), + )); + cff + } + Err(e) => { + warnings.push(PdfWarnMsg::warning( + 0, + 0, + format!("Failed to read embedded CFF font data: {}", e), + )); + return None; + } + }; + let font_name = cff.name_index.iter().next() + .and_then(|val| Some(String::from_utf8_lossy(val).to_string())); + let parsedsubset_font = ParsedSubsetFont{ + original_bytes: font_bytes.to_vec(), + font_type: FontType::ParsedEmbeddedType1C(font_bytes.to_vec()), + font_name, + cmap: None, + cmap_bytes: None, + font_properties: ParsedSubsetFontProperties::default(), + font_descriptor_properties: ParsedSubsetFontDescriptorProperties::default(), + }; + Some(parsedsubset_font) + } +} + #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(C)] pub struct FontMetrics { diff --git a/src/lib.rs b/src/lib.rs index 3c32363d..fcc8a221 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -449,6 +449,9 @@ pub struct PdfResources { /// Fonts found in the PDF file, indexed by the sha256 of their contents #[serde(default)] pub fonts: PdfFontMap, + /// Fonts found embedded as subset fonts in the PDF file, indexed by the sha256 of their contents + #[serde(skip)] + pub subsetfonts: PdfSubsetFontMap, /// XObjects (forms, images, embedded PDF contents, etc.) 
#[serde(default)] pub xobjects: XObjectMap, @@ -487,6 +490,11 @@ pub struct PdfFontMap { pub map: BTreeMap, } +#[derive(Debug, PartialEq, Default, Clone)] +pub struct PdfSubsetFontMap { + pub map: BTreeMap, +} + #[derive(Debug, PartialEq, Default, Clone)] pub struct ParsedIccProfile {} diff --git a/src/serialize.rs b/src/serialize.rs index eae7d70e..5868ca77 100644 --- a/src/serialize.rs +++ b/src/serialize.rs @@ -13,12 +13,7 @@ use lopdf::{ use serde_derive::{Deserialize, Serialize}; use crate::{ - color::IccProfile, - font::{FontType, SubsetFont}, - Actions, BuiltinFont, Color, ColorArray, Destination, FontId, IccProfileType, - ImageOptimizationOptions, Line, LinkAnnotation, Op, PaintMode, ParsedFont, PdfDocument, - PdfDocumentInfo, PdfPage, PdfResources, PdfWarnMsg, Polygon, PrepFont, TextItem, XObject, - XObjectId, + Actions, BuiltinFont, Color, ColorArray, Destination, FontId, IccProfileType, ImageOptimizationOptions, Line, LinkAnnotation, Op, PaintMode, ParsedFont, ParsedSubsetFont, PdfDocument, PdfDocumentInfo, PdfPage, PdfResources, PdfWarnMsg, Polygon, PrepFont, TextItem, XObject, XObjectId, color::IccProfile, font::{FontType, SubsetFont} }; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)] @@ -213,6 +208,13 @@ pub fn to_lopdf_doc( global_font_dict.set(font_id.0.clone(), Reference(font_dict_id)); } + let prepared_subsetfonts = prepare_subsetfonts(&pdf.resources, &pdf.pages, warnings); + for (font_id, prepared) in prepared_subsetfonts.iter() { + let font_dict = add_subsetfont_to_pdf(&mut doc, font_id, prepared); + let font_dict_id = doc.add_object(font_dict); + global_font_dict.set(font_id.0.clone(), Reference(font_dict_id)); + } + for internal_font in get_used_internal_fonts(&pdf.pages) { let font_dict = builtin_font_to_dict(&internal_font); let font_dict_id = doc.add_object(font_dict); @@ -288,6 +290,7 @@ pub fn to_lopdf_doc( let layer_stream = translate_operations( &page.ops, &prepared_fonts, + &prepared_subsetfonts, 
&pdf.resources.xobjects.map, opts.secure, warnings, @@ -433,6 +436,7 @@ fn builtin_font_to_dict(font: &BuiltinFont) -> LoDictionary { pub(crate) fn translate_operations( ops: &[Op], fonts: &BTreeMap, + subsetfonts: &BTreeMap, xobjects: &BTreeMap, secure: bool, warnings: &mut Vec, @@ -484,11 +488,14 @@ pub(crate) fn translate_operations( content.push(LoOp::new("ET", vec![])); } Op::WriteTextBuiltinFont { items, font } => { - encode_text_items_to_pdf::(items, None, Some(font), &mut content); + encode_text_items_to_pdf::(items, None, None, Some(font), &mut content); } Op::WriteText { items, font } => { if let Some(prepared_font) = fonts.get(font) { - encode_text_items_to_pdf(items, Some(prepared_font), None, &mut content); + encode_text_items_to_pdf(items, Some(prepared_font), None, None, &mut content); + } + if let Some(prepared_subsetfont) = subsetfonts.get(font) { + encode_text_items_to_pdf::(items, None, Some(prepared_subsetfont), None, &mut content); } } Op::WriteCodepoints { font, cp } => { @@ -723,6 +730,7 @@ pub(crate) fn translate_operations( fn encode_text_items_to_pdf( items: &[TextItem], prepared_font: Option<&T>, + prepared_subsetfont: Option<&PreparedSubsetFont>, builtin_font: Option<&BuiltinFont>, content: &mut Vec, ) { @@ -755,6 +763,52 @@ fn encode_text_items_to_pdf( vec![] }; + // Custom fonts must use hexadecimal encoding in PDF + tj_array.push(LoString(bytes, Hexadecimal)); + } else if let Some(font) = prepared_subsetfont { + // For embedded subset fonts, convert each character to its subset glyph ID + let bytes = if true { + match font.original.font_type { + FontType::ParsedEmbeddedType0( .. 
) => { + // Type0 embedded subset fonts use two bytes per character + text.chars() + .flat_map(|c| { + font.original.cmap.as_ref().unwrap().mappings + .iter() + .find(|(_, unicodechar)| { + let c = c as u32; + unicodechar.contains(&c) + }) + .map(|(cid, _)| *cid as u16) + .unwrap_or(0) + .to_be_bytes() + }) + .collect() + }, + FontType::ParsedEmbeddedType1C( .. ) => { + // Type1C embedded subset fonts use one byte per character + text.chars() + .map(|c| { + font.original.cmap.as_ref().unwrap().mappings + .iter() + .find(|(_, unicodechar)| { + let c = c as u32; + unicodechar.contains(&c) + }) + .map(|(cid, _)| *cid as u8) + .unwrap_or(0) + }) + .collect() + }, + _ => unimplemented!(), + } + } else { + // This branch is for reference/comparison but not used + // It would try to use lopdf::Document::encode_text if it supported + // UnicodeMapEncoding + vec![] + }; + // Custom fonts must use hexadecimal encoding in PDF tj_array.push(LoString(bytes, Hexadecimal)); } else if builtin_font.is_some() { @@ -905,6 +959,10 @@ impl PrepFont for PreparedFont { } } +pub(crate) struct PreparedSubsetFont { + original: ParsedSubsetFont, +} + const DEFAULT_CHARACTER_WIDTH: i64 = 1000; fn line_to_stream_ops(line: &Line) -> Vec { @@ -1161,6 +1219,25 @@ pub(crate) fn prepare_fonts( fonts_in_pdf } +pub(crate) fn prepare_subsetfonts( + resources: &PdfResources, + _pages: &[PdfPage], + _warnings: &mut Vec, +) -> BTreeMap { + let mut fonts_in_pdf = BTreeMap::new(); + + for (font_id, font) in resources.subsetfonts.map.iter() { + let prepared_font = PreparedSubsetFont{ + original: font.clone(), + }; + + fonts_in_pdf.insert(font_id.clone(), prepared_font); + } + + fonts_in_pdf +} + + fn add_font_to_pdf( doc: &mut lopdf::Document, font_id: &FontId, @@ -1172,7 +1249,9 @@ fn add_font_to_pdf( .clone() .unwrap_or(font_id.0.clone()); - let face_name = format!("{}+{}", font_id.0.clone().get(0..6).unwrap(), font_name); + // font ids are US-Ascii only, so `chars()` will always be on a character 
boundary + // this will make the font as subsetted + let face_name = format!("{}+{}", font_id.0.clone().chars().take(6).collect::(), font_name); let vertical = prepared.vertical_writing; @@ -1204,6 +1283,9 @@ fn add_font_to_pdf( ("FontFile2", Reference(doc.add_object(font_stream))), ) } + _ => { + unimplemented!() + } }; LoDictionary::from_iter(vec![ @@ -1282,6 +1364,221 @@ fn add_font_to_pdf( ]) } +fn add_subsetfont_to_pdf( + doc: &mut lopdf::Document, + font_id: &FontId, + prepared: &PreparedSubsetFont, +) -> LoDictionary { + let font_name = prepared + .original + .font_name + .clone() + .unwrap_or(font_id.0.clone()); + + // previously embedded subset fonts found during parsing already have the correct face_name + let face_name = font_name.clone(); + + let use_single_byte_for_cmap: bool; + let (sub_type, font_tuple) = match &prepared.original.font_type { + FontType::ParsedEmbeddedType0(buf) => { + // WARNING: Font stream MAY NOT be compressed + let font_stream = LoStream::new( + LoDictionary::new(), + buf.clone(), + ) + .with_compression(false); + use_single_byte_for_cmap = false; + ( + "Type0", + ("FontFile2", Reference(doc.add_object(font_stream))), + ) + }, + FontType::ParsedEmbeddedType1C(buf) => { + // WARNING: Font stream MAY NOT be compressed + let font_stream = LoStream::new( + LoDictionary::from_iter(vec![("Subtype", Name("Type1C".into()))]), + buf.clone(), + ) + .with_compression(false); + use_single_byte_for_cmap = true; + ( + "Type1", + ("FontFile3", Reference(doc.add_object(font_stream))), + ) + }, + _ => unimplemented!() + }; + + let mut font_vec = vec![ + ("Type", Name("Font".into())), + ("Subtype", Name(sub_type.into())), + ("BaseFont", Name(face_name.clone().into_bytes())), + ]; + if let Some(ref cmap_bytes) = prepared.original.cmap_bytes { + font_vec.push(( + "ToUnicode", + Reference(doc.add_object(LoStream::new( + LoDictionary::new(), + cmap_bytes.clone(), + ))) + )); + } else if let Some(ref cmap) = prepared.original.cmap { + font_vec.push(( + 
"ToUnicode", + Reference(doc.add_object(LoStream::new( + LoDictionary::new(), + cmap.to_cmap_string(&face_name, use_single_byte_for_cmap).as_bytes().to_vec(), + ))) + )); + } + if let Some(ref encoding) = prepared.original.font_properties.encoding { + font_vec.push(( "Encoding", Name(encoding.clone().into_bytes()))); + } + if let Some(ref custom_encoding) = prepared.original.font_properties.custom_encoding { + let mut custom_encoding_vec = vec![ + ("Type", Name("Encoding".into())), + ]; + if let Some(ref base_encoding) = custom_encoding.base_encoding { + custom_encoding_vec.push(( "BaseEncoding", Name(base_encoding.clone().into_bytes()))); + } + if let Some(ref differences) = custom_encoding.differences { + custom_encoding_vec.push(( + "Differences", + Array(differences.clone()), + )); + } + font_vec.push(( + "Encoding", + Reference( + doc.add_object(LoDictionary::from_iter(custom_encoding_vec)), + ), + )); + } + if let Some(first_char) = prepared.original.font_properties.first_char { + font_vec.push(( "FirstChar", Integer(first_char))); + } + if let Some(last_char) = prepared.original.font_properties.last_char { + font_vec.push(( "LastChar", Integer(last_char))); + } + if let Some(ref widths) = prepared.original.font_properties.widths { + font_vec.push(( + "Widths", + Array(widths.clone()), + )); + } + + let mut font_descriptor_vec = vec![ + ("Type", Name("FontDescriptor".into())), + ("FontName", Name(font_name.clone().into_bytes())), + font_tuple, + ]; + if let Some(ref stemv) = prepared.original.font_descriptor_properties.charset { + font_descriptor_vec.push(( "CharSet", LoString(stemv.clone().into_bytes(), Literal))); + } + if let Some(ref font_family) = prepared.original.font_descriptor_properties.font_family { + font_descriptor_vec.push(( "FontFamily", LoString(font_family.clone().into_bytes(), Literal))); + } + if let Some(ref font_stretch) = prepared.original.font_descriptor_properties.font_stretch { + font_descriptor_vec.push(( "FontStretch", 
Name(font_stretch.clone().into_bytes()))); + } + if let Some(cap_height) = prepared.original.font_descriptor_properties.cap_height { + font_descriptor_vec.push(( "CapHeight", Integer(cap_height))); + } + if let Some(ascent) = prepared.original.font_descriptor_properties.ascent { + font_descriptor_vec.push(( "Ascent", Integer(ascent))); + } + if let Some(descent) = prepared.original.font_descriptor_properties.descent { + font_descriptor_vec.push(( "Descent", Integer(descent))); + } + if let Some(italic_angle) = prepared.original.font_descriptor_properties.italic_angle { + font_descriptor_vec.push(( "ItalicAngle", Integer(italic_angle))); + } + if let Some(flags) = prepared.original.font_descriptor_properties.flags { + font_descriptor_vec.push(( "Flags", Integer(flags))); + } + if let Some(font_weight) = prepared.original.font_descriptor_properties.font_weight { + font_descriptor_vec.push(( "FontWeight", Integer(font_weight))); + } + if let Some(stemv) = prepared.original.font_descriptor_properties.stemv { + font_descriptor_vec.push(( "StemV", Integer(stemv))); + } + if let Some(xheight) = prepared.original.font_descriptor_properties.xheight { + font_descriptor_vec.push(( "XHeight", Integer(xheight))); + } + if let Some(ref font_bbox) = prepared.original.font_descriptor_properties.font_bbox { + font_descriptor_vec.push(( "FontBBox", Array(font_bbox.clone()))); + } + if let Some(ref cid_set) = prepared.original.font_descriptor_properties.cid_set { + font_descriptor_vec.push(( + "CIDSet", + Reference(doc.add_object(LoStream::new(lopdf::Dictionary::new(), cid_set.clone()))) + )); + } + + if sub_type == "Type1" { + font_vec.push(( + "FontDescriptor", + Reference( + doc.add_object(LoDictionary::from_iter(font_descriptor_vec)), + ), + )); + } else if sub_type == "Type0" { + if let Some(ref descendant_fonts) = prepared.original.font_properties.descendant_fonts { + if !descendant_fonts.is_empty() { + let mut descendant_fonts_vec = vec![ + ("Type", Name("Font".into())), + ]; 
+ if let Some(ref base_font) = descendant_fonts[0].base_font { + descendant_fonts_vec.push(( "BaseFont", Name(base_font.clone().into_bytes()))); + } + if let Some(ref subtype) = descendant_fonts[0].subtype { + descendant_fonts_vec.push(( "Subtype", Name(subtype.clone().into_bytes()))); + } + if let Some(ref cid_to_gid_map) = descendant_fonts[0].cid_to_gid_map { + descendant_fonts_vec.push(( "CIDToGIDMap", Name(cid_to_gid_map.clone().into_bytes()))); + } + if let Some(ref dw) = descendant_fonts[0].dw { + descendant_fonts_vec.push(( "DW", Integer(*dw))); + } + + if let Some(ref cid_system_info) = descendant_fonts[0].cid_system_info { + let mut cid_system_info_vec = vec![]; + if let Some(ref ordering) = cid_system_info.ordering { + cid_system_info_vec.push(( "Ordering", LoString(ordering.clone().into_bytes(), Literal))); + } + if let Some(ref registry) = cid_system_info.registry { + cid_system_info_vec.push(( "Registry", LoString(registry.clone().into_bytes(), Literal))); + } + if let Some(ref supplement) = cid_system_info.supplement { + cid_system_info_vec.push(( "DW", Integer(*supplement))); + } + descendant_fonts_vec.push(( + "CIDSystemInfo", + Reference( + doc.add_object(LoDictionary::from_iter(cid_system_info_vec)), + ), + )); + } + + descendant_fonts_vec.push(( + "FontDescriptor", + Reference( + doc.add_object(LoDictionary::from_iter(font_descriptor_vec)), + ), + )); + font_vec.push(( + "DescendantFonts", + Array(vec![Reference( + doc.add_object(LoDictionary::from_iter(descendant_fonts_vec)), + )]), + )); + } + } + } + + LoDictionary::from_iter(font_vec) +} + fn docinfo_to_dict(m: &PdfDocumentInfo) -> LoDictionary { let trapping = if m.trapped { "True" } else { "False" }; let gts_pdfx_version = m.conformance.get_identifier_string(); @@ -1292,7 +1589,7 @@ fn docinfo_to_dict(m: &PdfDocumentInfo) -> LoDictionary { let creation_date = LoString(info_create_date.into_bytes(), Literal); let identifier = LoString(m.identifier.as_bytes().to_vec(), Literal); - 
LoDictionary::from_iter(vec![ + let mut dict_vec = vec![ ("Trapped", trapping.into()), ("CreationDate", creation_date), ("ModDate", LoString(info_mod_date.into_bytes(), Literal)), @@ -1300,14 +1597,27 @@ fn docinfo_to_dict(m: &PdfDocumentInfo) -> LoDictionary { "GTS_PDFXVersion", LoString(gts_pdfx_version.into(), Literal), ), - ("Title", encode_text_to_utf16be(&m.document_title)), - ("Author", encode_text_to_utf16be(&m.author)), - ("Creator", encode_text_to_utf16be(&m.creator)), - ("Producer", encode_text_to_utf16be(&m.producer)), - ("Subject", encode_text_to_utf16be(&m.subject)), ("Identifier", identifier), - ("Keywords", encode_text_to_utf16be(&m.keywords.join(","))), - ]) + ]; + if !m.document_title.is_empty() { + dict_vec.push(("Title", encode_text_to_utf16be(&m.document_title))); + } + if !m.author.is_empty() { + dict_vec.push(("Author", encode_text_to_utf16be(&m.author))); + } + if !m.creator.is_empty() { + dict_vec.push(("Creator", encode_text_to_utf16be(&m.creator))); + } + if !m.producer.is_empty() { + dict_vec.push(("Producer", encode_text_to_utf16be(&m.producer))); + } + if !m.subject.is_empty() { + dict_vec.push(("Subject", encode_text_to_utf16be(&m.subject))); + } + if !m.keywords.is_empty() { + dict_vec.push(("Keywords", encode_text_to_utf16be(&m.keywords.join(",")))); + } + LoDictionary::from_iter(dict_vec) } fn icc_to_stream(val: &IccProfile) -> LoStream { diff --git a/src/svg.rs b/src/svg.rs index db2e4472..d04b0b77 100644 --- a/src/svg.rs +++ b/src/svg.rs @@ -63,6 +63,7 @@ impl Svg { let stream = crate::serialize::translate_operations( &page.ops, &crate::serialize::prepare_fonts(&PdfResources::default(), &[], warnings), + &crate::serialize::prepare_subsetfonts(&PdfResources::default(), &[], warnings), &BTreeMap::new(), true, warnings, diff --git a/src/text.rs b/src/text.rs index 3c03d8ac..d983c61d 100644 --- a/src/text.rs +++ b/src/text.rs @@ -59,6 +59,7 @@ impl From for TextItem { /// (In a full implementation, this would use the actual mapping 
/// Maps the raw bytes of a PDF string to Unicode text.
///
/// `decode_pdf_string` calls [`CMap::map_bytes`] for literal strings and
/// [`CMap::map_bytes_u16be`] for hexadecimal strings that carry no byte order
/// mark.
pub trait CMap {
    /// Maps `bytes` to text, treating the input as single-byte character codes.
    fn map_bytes(&self, bytes: &[u8]) -> String;

    /// Maps `bytes` to text, treating the input as 2-byte big-endian codes.
    ///
    /// The default implementation forwards to [`CMap::map_bytes`]. This keeps
    /// implementors written before this method existed compiling, and keeps
    /// their hex strings decoded by the same method that handled them before.
    /// Implementors that distinguish the two widths should override it.
    fn map_bytes_u16be(&self, bytes: &[u8]) -> String {
        self.map_bytes(bytes)
    }
}