001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.imaging.formats.jpeg.iptc; 019 020import java.io.ByteArrayInputStream; 021import java.io.ByteArrayOutputStream; 022import java.io.IOException; 023import java.io.InputStream; 024import java.nio.ByteOrder; 025import java.nio.charset.Charset; 026import java.nio.charset.StandardCharsets; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.Comparator; 030import java.util.List; 031import java.util.Objects; 032import java.util.logging.Level; 033import java.util.logging.Logger; 034 035import org.apache.commons.imaging.ImagingConstants; 036import org.apache.commons.imaging.ImagingException; 037import org.apache.commons.imaging.ImagingParameters; 038import org.apache.commons.imaging.common.AbstractBinaryOutputStream; 039import org.apache.commons.imaging.common.Allocator; 040import org.apache.commons.imaging.common.BinaryFileParser; 041import org.apache.commons.imaging.common.BinaryFunctions; 042import org.apache.commons.imaging.common.ByteConversions; 043import org.apache.commons.imaging.formats.jpeg.JpegConstants; 044import org.apache.commons.imaging.formats.jpeg.JpegImagingParameters; 045import org.apache.commons.imaging.internal.Debug; 046 047public class IptcParser extends BinaryFileParser { 048 049 private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName()); 050 051 private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN; 052 053 /** 054 * Block types (or Image Resource IDs) that are not recommended to be interpreted when libraries process Photoshop IPTC metadata. 055 * 056 * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/">Adobe Photoshop File Formats Specification</a> 057 * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a> 058 * @since 1.0-alpha2 059 */ 060 private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087); 061 062 private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; 063 private static final int ENV_TAG_CODED_CHARACTER_SET = 90; 064 private static final byte[] CHARACTER_ESCAPE_SEQUENCE = { '\u001B', '%', 'G' }; 065 066 /** 067 * Constructs a new instance with the default, big-endian, byte order. 068 */ 069 public IptcParser() { 070 // empty 071 } 072 073 private Charset findCharset(final byte[] codedCharset) { 074 final String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1); 075 try { 076 if (Charset.isSupported(codedCharsetString)) { 077 return Charset.forName(codedCharsetString); 078 } 079 } catch (final IllegalArgumentException ignored) { 080 // ignored 081 } 082 // check if encoding is a escape sequence 083 // normalize encoding byte sequence 084 final byte[] codedCharsetNormalized = Allocator.byteArray(codedCharset.length); 085 int j = 0; 086 for (final byte element : codedCharset) { 087 if (element != ' ') { 088 codedCharsetNormalized[j++] = element; 089 } 090 } 091 092 if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) { 093 return StandardCharsets.UTF_8; 094 } 095 return DEFAULT_CHARSET; 096 } 097 098 public boolean isPhotoshopJpegSegment(final byte[] segmentData) { 099 if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.isStartOf(segmentData)) { 100 return false; 101 } 102 103 final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(); 104 return index + 4 <= segmentData.length && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM; 105 } 106 107 protected List<IptcBlock> parseAllBlocks(final byte[] bytes, final boolean strict) throws ImagingException, IOException { 108 final List<IptcBlock> blocks = new ArrayList<>(); 109 110 try (InputStream bis = new ByteArrayInputStream(bytes)) { 111 112 // Note that these are unsigned quantities. Name is always an even 113 // number of bytes (including the 1st byte, which is the size.) 114 115 final byte[] idString = BinaryFunctions.readBytes("", bis, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(), 116 "App13 Segment missing identification string"); 117 if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) { 118 throw new ImagingException("Not a Photoshop App13 Segment"); 119 } 120 121 // int index = PHOTOSHOP_IDENTIFICATION_STRING.length; 122 123 while (true) { 124 final int imageResourceBlockSignature; 125 try { 126 imageResourceBlockSignature = BinaryFunctions.read4Bytes("", bis, "Image Resource Block missing identification string", APP13_BYTE_ORDER); 127 } catch (final IOException ioEx) { 128 break; 129 } 130 if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) { 131 throw new ImagingException("Invalid Image Resource Block Signature"); 132 } 133 134 final int blockType = BinaryFunctions.read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER); 135 Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")"); 136 137 // skip blocks that the photoshop spec recommends to, see IMAGING-246 138 if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) { 139 Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")"); 140 // if there is still data in this block, before the next image resource block 141 // (8BIM), then we must consume these bytes to leave a pointer ready to read 142 // the next block 143 BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis); 144 continue; 145 } 146 147 final int blockNameLength = BinaryFunctions.readByte("Name length", bis, "Image Resource Block missing name length"); 148 if (blockNameLength > 0) { 149 Debug.debug("blockNameLength: " + blockNameLength + " (0x" + Integer.toHexString(blockNameLength) + ")"); 150 } 151 final byte[] blockNameBytes; 152 if (blockNameLength == 0) { 153 BinaryFunctions.readByte("Block name bytes", bis, "Image Resource Block has invalid name"); 154 blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY; 155 } else { 156 try { 157 blockNameBytes = BinaryFunctions.readBytes("", bis, blockNameLength, "Invalid Image Resource Block name"); 158 } catch (final IOException ioEx) { 159 if (strict) { 160 throw ioEx; 161 } 162 break; 163 } 164 165 if (blockNameLength % 2 == 0) { 166 BinaryFunctions.readByte("Padding byte", bis, "Image Resource Block missing padding byte"); 167 } 168 } 169 170 final int blockSize = BinaryFunctions.read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER); 171 Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")"); 172 173 /* 174 * doesn't catch cases where blocksize is invalid but is still less than bytes.length but will at least prevent OutOfMemory errors 175 */ 176 if (blockSize > bytes.length) { 177 throw new ImagingException("Invalid Block Size : " + blockSize + " > " + bytes.length); 178 } 179 180 final byte[] blockData; 181 try { 182 blockData = BinaryFunctions.readBytes("", bis, blockSize, "Invalid Image Resource Block data"); 183 } catch (final IOException ioEx) { 184 if (strict) { 185 throw ioEx; 186 } 187 break; 188 } 189 190 blocks.add(new IptcBlock(blockType, blockNameBytes, blockData)); 191 192 if (blockSize % 2 != 0) { 193 BinaryFunctions.readByte("Padding byte", bis, "Image Resource Block missing padding byte"); 194 } 195 } 196 197 return blocks; 198 } 199 } 200 201 protected List<IptcRecord> parseIptcBlock(final byte[] bytes) { 202 Charset charset = DEFAULT_CHARSET; 203 final List<IptcRecord> elements = new ArrayList<>(); 204 205 int index = 0; 206 // Integer recordVersion = null; 207 while (index + 1 < bytes.length) { 208 final int tagMarker = 0xff & bytes[index++]; 209 Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")"); 210 211 if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) { 212 if (LOGGER.isLoggable(Level.FINE)) { 213 LOGGER.fine("Unexpected record tag marker in IPTC data."); 214 } 215 return elements; 216 } 217 218 final int recordNumber = 0xff & bytes[index++]; 219 Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")"); 220 221 // int recordPrefix = convertByteArrayToShort("recordPrefix", index, 222 // bytes); 223 // if (verbose) 224 // Debug.debug("recordPrefix", recordPrefix + " (0x" 225 // + Integer.toHexString(recordPrefix) + ")"); 226 // index += 2; 227 // 228 // if (recordPrefix != IPTC_RECORD_PREFIX) 229 // { 230 // if (verbose) 231 // System.out 232 // .println("Unexpected record prefix in IPTC data!"); 233 // return elements; 234 // } 235 236 // throw new ImageReadException( 237 // "Unexpected record prefix in IPTC data."); 238 239 final int recordType = 0xff & bytes[index]; 240 Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")"); 241 index++; 242 243 final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder()); 244 index += 2; 245 246 final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE; 247 final int dataFieldCountLength = recordSize & 0x7fff; 248 if (extendedDataset) { 249 Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength); 250 } 251 if (extendedDataset) { 252 // ignore extended dataset and everything after. 253 return elements; 254 } 255 256 final byte[] recordData = BinaryFunctions.copyOfRange(bytes, index, recordSize); 257 index += recordSize; 258 259 // Debug.debug("recordSize", recordSize + " (0x" 260 // + Integer.toHexString(recordSize) + ")"); 261 262 if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) { 263 charset = findCharset(recordData); 264 continue; 265 } 266 267 if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) { 268 continue; 269 } 270 271 if (recordType == 0) { 272 if (LOGGER.isLoggable(Level.FINE)) { 273 LOGGER.fine("ignore record version record! " + elements.size()); 274 } 275 // ignore "record version" record; 276 continue; 277 } 278 // if (recordVersion == null) 279 // { 280 // // The first record in a JPEG/Photoshop IPTC block must be 281 // // the record version. 282 // if (recordType != 0) 283 // throw new ImageReadException("Missing record version: " 284 // + recordType); 285 // recordVersion = new Integer(convertByteArrayToShort( 286 // "recordNumber", recordData)); 287 // 288 // if (recordSize != 2) 289 // throw new ImageReadException( 290 // "Invalid record version record size: " + recordSize); 291 // 292 // // JPEG/Photoshop IPTC metadata is always in Record version 293 // // 2 294 // if (recordVersion.intValue() != 2) 295 // throw new ImageReadException( 296 // "Invalid IPTC record version: " + recordVersion); 297 // 298 // // Debug.debug("recordVersion", recordVersion); 299 // continue; 300 // } 301 302 final String value = new String(recordData, charset); 303 304 final IptcType iptcType = IptcTypeLookup.getIptcType(recordType); 305 306 // Debug.debug("iptcType", iptcType); 307 // debugByteArray("iptcData", iptcData); 308 // Debug.debug(); 309 310 // if (recordType == IPTC_TYPE_CREDIT.type 311 // || recordType == IPTC_TYPE_OBJECT_NAME.type) 312 // { 313 // this.debugByteArray("recordData", recordData); 314 // Debug.debug("index", IPTC_TYPE_CREDIT.name); 315 // } 316 317 final IptcRecord element = new IptcRecord(iptcType, value); 318 elements.add(element); 319 } 320 321 return elements; 322 } 323 324 public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImagingException, IOException { 325 final List<IptcRecord> records = new ArrayList<>(); 326 327 final List<IptcBlock> blocks = parseAllBlocks(bytes, strict); 328 329 for (final IptcBlock block : blocks) { 330 // Ignore everything but IPTC data. 331 if (!block.isIptcBlock()) { 332 continue; 333 } 334 335 records.addAll(parseIptcBlock(block.getBlockData())); 336 } 337 338 return new PhotoshopApp13Data(records, blocks); 339 } 340 341 // private void writeIPTCRecord(BinaryOutputStream bos, ) 342 343 /* 344 * In practice, App13 segments are only used for Photoshop/IPTC metadata. However, we should not treat App13 signatures without Photoshop's signature as 345 * Photoshop/IPTC segments. 346 * 347 * A Photoshop/IPTC App13 segment begins with the Photoshop Identification string. 348 * 349 * There follows 0-N blocks (Photoshop calls them "Image Resource Blocks"). 350 * 351 * Each block has the following structure: 352 * 353 * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 segment. 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka. 354 * IPTC_NAA_RECORD_IMAGE_RESOURCE_ID 3. Block name as a Pascal String. This is padded to have an even length. 4. 4-byte size (in bytes). 5. Block data. This 355 * is also padded to have an even length. 356 * 357 * The block data consists of a 0-N records. A record has the following structure: 358 * 359 * 1. 2-byte prefix. The value is always 0x1C02 2. 1-byte record type. The record types are documented by the IPTC. See IptcConstants. 3. 2-byte record size 360 * (in bytes). 4. Record data, "record size" bytes long. 361 * 362 * Record data (unlike block data) is NOT padded to have an even length. 363 * 364 * Record data, for IPTC record, should always be ISO-8859-1. But according to SANSELAN-33, this isn't always the case. 365 * 366 * The exception is the first record in the block, which must always be a record version record, whose value is a two-byte number; the value is 0x02. 367 * 368 * Some IPTC blocks are missing this first "record version" record, so we don't require it. 369 */ 370 public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters<JpegImagingParameters> params) 371 throws ImagingException, IOException { 372 final boolean strict = params != null && params.isStrict(); 373 374 return parsePhotoshopSegment(bytes, strict); 375 } 376 377 public byte[] writeIptcBlock(final List<IptcRecord> elements) throws ImagingException, IOException { 378 return writeIptcBlock(elements, false); 379 } 380 381 public byte[] writeIptcBlock(List<IptcRecord> elements, final boolean forceUtf8Encoding) throws ImagingException, IOException { 382 Charset charset; 383 if (forceUtf8Encoding) { 384 // Using UTF-8 is forced 385 charset = StandardCharsets.UTF_8; 386 } else { 387 // Check if all values can be converted to bytes with DEFAULT_CHARSET, 388 // otherwise use UTF-8 389 charset = DEFAULT_CHARSET; 390 for (final IptcRecord element : elements) { 391 final byte[] recordData = element.getValue().getBytes(charset); 392 if (!new String(recordData, charset).equals(element.getValue())) { 393 charset = StandardCharsets.UTF_8; 394 break; 395 } 396 } 397 } 398 final ByteArrayOutputStream baos = new ByteArrayOutputStream(); 399 try (AbstractBinaryOutputStream bos = AbstractBinaryOutputStream.create(baos, getByteOrder())) { 400 if (!charset.equals(DEFAULT_CHARSET)) { 401 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER); 402 bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER); 403 bos.write(ENV_TAG_CODED_CHARACTER_SET); 404 final byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE; 405 bos.write2Bytes(codedCharset.length); 406 bos.write(codedCharset); 407 } 408 409 // first, right record version record 410 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER); 411 bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER); 412 bos.write(IptcTypes.RECORD_VERSION.type); // record version record 413 // type. 414 bos.write2Bytes(2); // record version record size 415 bos.write2Bytes(2); // record version value 416 417 // make a copy of the list. 418 elements = new ArrayList<>(elements); 419 420 // sort the list. Records must be in numerical order. 421 final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType(); 422 elements.sort(comparator); 423 // TODO: make sure order right 424 425 // write the list. 426 for (final IptcRecord element : elements) { 427 if (element.iptcType == IptcTypes.RECORD_VERSION) { 428 continue; // ignore 429 } 430 431 bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER); 432 bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER); 433 if (element.iptcType.getType() < 0 || element.iptcType.getType() > 0xff) { 434 throw new ImagingException("Invalid record type: " + element.iptcType.getType()); 435 } 436 bos.write(element.iptcType.getType()); 437 438 final byte[] recordData = element.getValue().getBytes(charset); 439 /* 440 * if (!new String(recordData, charset).equals(element.getValue())) { throw new ImageWriteException( "Invalid record value, not " + 441 * charset.name()); } 442 */ 443 444 bos.write2Bytes(recordData.length); 445 bos.write(recordData); 446 } 447 } 448 449 return baos.toByteArray(); 450 } 451 452 public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) throws IOException, ImagingException { 453 try (ByteArrayOutputStream os = new ByteArrayOutputStream(); 454 AbstractBinaryOutputStream bos = AbstractBinaryOutputStream.bigEndian(os)) { 455 456 JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos); 457 458 final List<IptcBlock> blocks = data.getRawBlocks(); 459 for (final IptcBlock block : blocks) { 460 bos.write4Bytes(JpegConstants.CONST_8BIM); 461 462 if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) { 463 throw new ImagingException("Invalid IPTC block type."); 464 } 465 bos.write2Bytes(block.getBlockType()); 466 467 final byte[] blockNameBytes = block.getBlockNameBytes(); 468 if (blockNameBytes.length > 255) { 469 throw new ImagingException("IPTC block name is too long: " + blockNameBytes.length); 470 } 471 bos.write(blockNameBytes.length); 472 bos.write(blockNameBytes); 473 if (blockNameBytes.length % 2 == 0) { 474 bos.write(0); // pad to even size, including length byte. 475 } 476 477 final byte[] blockData = block.getBlockData(); 478 if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) { 479 throw new ImagingException("IPTC block data is too long: " + blockData.length); 480 } 481 bos.write4Bytes(blockData.length); 482 bos.write(blockData); 483 if (blockData.length % 2 == 1) { 484 bos.write(0); // pad to even size 485 } 486 } 487 488 bos.flush(); 489 return os.toByteArray(); 490 } 491 } 492 493}