001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.imaging.formats.jpeg.iptc;
019
020import java.io.ByteArrayInputStream;
021import java.io.ByteArrayOutputStream;
022import java.io.IOException;
023import java.io.InputStream;
024import java.nio.ByteOrder;
025import java.nio.charset.Charset;
026import java.nio.charset.StandardCharsets;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Comparator;
030import java.util.List;
031import java.util.Objects;
032import java.util.logging.Level;
033import java.util.logging.Logger;
034
035import org.apache.commons.imaging.ImagingConstants;
036import org.apache.commons.imaging.ImagingException;
037import org.apache.commons.imaging.ImagingParameters;
038import org.apache.commons.imaging.common.AbstractBinaryOutputStream;
039import org.apache.commons.imaging.common.Allocator;
040import org.apache.commons.imaging.common.BinaryFileParser;
041import org.apache.commons.imaging.common.BinaryFunctions;
042import org.apache.commons.imaging.common.ByteConversions;
043import org.apache.commons.imaging.formats.jpeg.JpegConstants;
044import org.apache.commons.imaging.formats.jpeg.JpegImagingParameters;
045import org.apache.commons.imaging.internal.Debug;
046
047public class IptcParser extends BinaryFileParser {
048
    /** Logger used for FINE-level diagnostics while parsing IPTC records. */
    private static final Logger LOGGER = Logger.getLogger(IptcParser.class.getName());

    /** Byte order used when reading the multi-byte fields of an App13 segment (signature, block type, block size). */
    private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN;

    /**
     * Block types (or Image Resource IDs) that are not recommended to be interpreted when libraries process Photoshop IPTC metadata.
     *
     * @see <a href="https://www.adobe.com/devnet-apps/photoshop/fileformatashtml/">Adobe Photoshop File Formats Specification</a>
     * @see <a href="https://issues.apache.org/jira/browse/IMAGING-246">IMAGING-246</a>
     * @since 1.0-alpha2
     */
    private static final List<Integer> PHOTOSHOP_IGNORED_BLOCK_TYPE = Arrays.asList(1084, 1085, 1086, 1087);

    /** Charset used for record values unless a coded character set record selects another one. */
    private static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
    /** Record type, within the envelope record, whose data announces the character set of the following records. */
    private static final int ENV_TAG_CODED_CHARACTER_SET = 90;
    /** The bytes {@code ESC % G}; read as "UTF-8" by {@code findCharset} and written by {@code writeIptcBlock} when UTF-8 is used. */
    private static final byte[] CHARACTER_ESCAPE_SEQUENCE = { '\u001B', '%', 'G' };

    /**
     * Constructs a new instance with the default, big-endian, byte order.
     */
    public IptcParser() {
        // empty: no state to initialize in this class
    }
072
073    private Charset findCharset(final byte[] codedCharset) {
074        final String codedCharsetString = new String(codedCharset, StandardCharsets.ISO_8859_1);
075        try {
076            if (Charset.isSupported(codedCharsetString)) {
077                return Charset.forName(codedCharsetString);
078            }
079        } catch (final IllegalArgumentException ignored) {
080            // ignored
081        }
082        // check if encoding is a escape sequence
083        // normalize encoding byte sequence
084        final byte[] codedCharsetNormalized = Allocator.byteArray(codedCharset.length);
085        int j = 0;
086        for (final byte element : codedCharset) {
087            if (element != ' ') {
088                codedCharsetNormalized[j++] = element;
089            }
090        }
091
092        if (Objects.deepEquals(codedCharsetNormalized, CHARACTER_ESCAPE_SEQUENCE)) {
093            return StandardCharsets.UTF_8;
094        }
095        return DEFAULT_CHARSET;
096    }
097
098    public boolean isPhotoshopJpegSegment(final byte[] segmentData) {
099        if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.isStartOf(segmentData)) {
100            return false;
101        }
102
103        final int index = JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size();
104        return index + 4 <= segmentData.length && ByteConversions.toInt(segmentData, index, APP13_BYTE_ORDER) == JpegConstants.CONST_8BIM;
105    }
106
107    protected List<IptcBlock> parseAllBlocks(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
108        final List<IptcBlock> blocks = new ArrayList<>();
109
110        try (InputStream bis = new ByteArrayInputStream(bytes)) {
111
112            // Note that these are unsigned quantities. Name is always an even
113            // number of bytes (including the 1st byte, which is the size.)
114
115            final byte[] idString = BinaryFunctions.readBytes("", bis, JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.size(),
116                    "App13 Segment missing identification string");
117            if (!JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.equals(idString)) {
118                throw new ImagingException("Not a Photoshop App13 Segment");
119            }
120
121            // int index = PHOTOSHOP_IDENTIFICATION_STRING.length;
122
123            while (true) {
124                final int imageResourceBlockSignature;
125                try {
126                    imageResourceBlockSignature = BinaryFunctions.read4Bytes("", bis, "Image Resource Block missing identification string", APP13_BYTE_ORDER);
127                } catch (final IOException ioEx) {
128                    break;
129                }
130                if (imageResourceBlockSignature != JpegConstants.CONST_8BIM) {
131                    throw new ImagingException("Invalid Image Resource Block Signature");
132                }
133
134                final int blockType = BinaryFunctions.read2Bytes("", bis, "Image Resource Block missing type", APP13_BYTE_ORDER);
135                Debug.debug("blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
136
137                // skip blocks that the photoshop spec recommends to, see IMAGING-246
138                if (PHOTOSHOP_IGNORED_BLOCK_TYPE.contains(blockType)) {
139                    Debug.debug("Skipping blockType: " + blockType + " (0x" + Integer.toHexString(blockType) + ")");
140                    // if there is still data in this block, before the next image resource block
141                    // (8BIM), then we must consume these bytes to leave a pointer ready to read
142                    // the next block
143                    BinaryFunctions.searchQuad(JpegConstants.CONST_8BIM, bis);
144                    continue;
145                }
146
147                final int blockNameLength = BinaryFunctions.readByte("Name length", bis, "Image Resource Block missing name length");
148                if (blockNameLength > 0) {
149                    Debug.debug("blockNameLength: " + blockNameLength + " (0x" + Integer.toHexString(blockNameLength) + ")");
150                }
151                final byte[] blockNameBytes;
152                if (blockNameLength == 0) {
153                    BinaryFunctions.readByte("Block name bytes", bis, "Image Resource Block has invalid name");
154                    blockNameBytes = ImagingConstants.EMPTY_BYTE_ARRAY;
155                } else {
156                    try {
157                        blockNameBytes = BinaryFunctions.readBytes("", bis, blockNameLength, "Invalid Image Resource Block name");
158                    } catch (final IOException ioEx) {
159                        if (strict) {
160                            throw ioEx;
161                        }
162                        break;
163                    }
164
165                    if (blockNameLength % 2 == 0) {
166                        BinaryFunctions.readByte("Padding byte", bis, "Image Resource Block missing padding byte");
167                    }
168                }
169
170                final int blockSize = BinaryFunctions.read4Bytes("", bis, "Image Resource Block missing size", APP13_BYTE_ORDER);
171                Debug.debug("blockSize: " + blockSize + " (0x" + Integer.toHexString(blockSize) + ")");
172
173                /*
174                 * doesn't catch cases where blocksize is invalid but is still less than bytes.length but will at least prevent OutOfMemory errors
175                 */
176                if (blockSize > bytes.length) {
177                    throw new ImagingException("Invalid Block Size : " + blockSize + " > " + bytes.length);
178                }
179
180                final byte[] blockData;
181                try {
182                    blockData = BinaryFunctions.readBytes("", bis, blockSize, "Invalid Image Resource Block data");
183                } catch (final IOException ioEx) {
184                    if (strict) {
185                        throw ioEx;
186                    }
187                    break;
188                }
189
190                blocks.add(new IptcBlock(blockType, blockNameBytes, blockData));
191
192                if (blockSize % 2 != 0) {
193                    BinaryFunctions.readByte("Padding byte", bis, "Image Resource Block missing padding byte");
194                }
195            }
196
197            return blocks;
198        }
199    }
200
201    protected List<IptcRecord> parseIptcBlock(final byte[] bytes) {
202        Charset charset = DEFAULT_CHARSET;
203        final List<IptcRecord> elements = new ArrayList<>();
204
205        int index = 0;
206        // Integer recordVersion = null;
207        while (index + 1 < bytes.length) {
208            final int tagMarker = 0xff & bytes[index++];
209            Debug.debug("tagMarker: " + tagMarker + " (0x" + Integer.toHexString(tagMarker) + ")");
210
211            if (tagMarker != IptcConstants.IPTC_RECORD_TAG_MARKER) {
212                if (LOGGER.isLoggable(Level.FINE)) {
213                    LOGGER.fine("Unexpected record tag marker in IPTC data.");
214                }
215                return elements;
216            }
217
218            final int recordNumber = 0xff & bytes[index++];
219            Debug.debug("recordNumber: " + recordNumber + " (0x" + Integer.toHexString(recordNumber) + ")");
220
221            // int recordPrefix = convertByteArrayToShort("recordPrefix", index,
222            // bytes);
223            // if (verbose)
224            // Debug.debug("recordPrefix", recordPrefix + " (0x"
225            // + Integer.toHexString(recordPrefix) + ")");
226            // index += 2;
227            //
228            // if (recordPrefix != IPTC_RECORD_PREFIX)
229            // {
230            // if (verbose)
231            // System.out
232            // .println("Unexpected record prefix in IPTC data!");
233            // return elements;
234            // }
235
236            // throw new ImageReadException(
237            // "Unexpected record prefix in IPTC data.");
238
239            final int recordType = 0xff & bytes[index];
240            Debug.debug("recordType: " + recordType + " (0x" + Integer.toHexString(recordType) + ")");
241            index++;
242
243            final int recordSize = ByteConversions.toUInt16(bytes, index, getByteOrder());
244            index += 2;
245
246            final boolean extendedDataset = recordSize > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE;
247            final int dataFieldCountLength = recordSize & 0x7fff;
248            if (extendedDataset) {
249                Debug.debug("extendedDataset. dataFieldCountLength: " + dataFieldCountLength);
250            }
251            if (extendedDataset) {
252                // ignore extended dataset and everything after.
253                return elements;
254            }
255
256            final byte[] recordData = BinaryFunctions.copyOfRange(bytes, index, recordSize);
257            index += recordSize;
258
259            // Debug.debug("recordSize", recordSize + " (0x"
260            // + Integer.toHexString(recordSize) + ")");
261
262            if (recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET) {
263                charset = findCharset(recordData);
264                continue;
265            }
266
267            if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) {
268                continue;
269            }
270
271            if (recordType == 0) {
272                if (LOGGER.isLoggable(Level.FINE)) {
273                    LOGGER.fine("ignore record version record! " + elements.size());
274                }
275                // ignore "record version" record;
276                continue;
277            }
278            // if (recordVersion == null)
279            // {
280            // // The first record in a JPEG/Photoshop IPTC block must be
281            // // the record version.
282            // if (recordType != 0)
283            // throw new ImageReadException("Missing record version: "
284            // + recordType);
285            // recordVersion = new Integer(convertByteArrayToShort(
286            // "recordNumber", recordData));
287            //
288            // if (recordSize != 2)
289            // throw new ImageReadException(
290            // "Invalid record version record size: " + recordSize);
291            //
292            // // JPEG/Photoshop IPTC metadata is always in Record version
293            // // 2
294            // if (recordVersion.intValue() != 2)
295            // throw new ImageReadException(
296            // "Invalid IPTC record version: " + recordVersion);
297            //
298            // // Debug.debug("recordVersion", recordVersion);
299            // continue;
300            // }
301
302            final String value = new String(recordData, charset);
303
304            final IptcType iptcType = IptcTypeLookup.getIptcType(recordType);
305
306            // Debug.debug("iptcType", iptcType);
307            // debugByteArray("iptcData", iptcData);
308            // Debug.debug();
309
310            // if (recordType == IPTC_TYPE_CREDIT.type
311            // || recordType == IPTC_TYPE_OBJECT_NAME.type)
312            // {
313            // this.debugByteArray("recordData", recordData);
314            // Debug.debug("index", IPTC_TYPE_CREDIT.name);
315            // }
316
317            final IptcRecord element = new IptcRecord(iptcType, value);
318            elements.add(element);
319        }
320
321        return elements;
322    }
323
324    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final boolean strict) throws ImagingException, IOException {
325        final List<IptcRecord> records = new ArrayList<>();
326
327        final List<IptcBlock> blocks = parseAllBlocks(bytes, strict);
328
329        for (final IptcBlock block : blocks) {
330            // Ignore everything but IPTC data.
331            if (!block.isIptcBlock()) {
332                continue;
333            }
334
335            records.addAll(parseIptcBlock(block.getBlockData()));
336        }
337
338        return new PhotoshopApp13Data(records, blocks);
339    }
340
341    // private void writeIPTCRecord(BinaryOutputStream bos, )
342
343    /*
344     * In practice, App13 segments are only used for Photoshop/IPTC metadata. However, we should not treat App13 signatures without Photoshop's signature as
345     * Photoshop/IPTC segments.
346     *
347     * A Photoshop/IPTC App13 segment begins with the Photoshop Identification string.
348     *
349     * There follows 0-N blocks (Photoshop calls them "Image Resource Blocks").
350     *
351     * Each block has the following structure:
352     *
353     * 1. 4-byte type. This is always "8BIM" for blocks in a Photoshop App13 segment. 2. 2-byte id. IPTC data is stored in blocks with id 0x0404, aka.
354     * IPTC_NAA_RECORD_IMAGE_RESOURCE_ID 3. Block name as a Pascal String. This is padded to have an even length. 4. 4-byte size (in bytes). 5. Block data. This
355     * is also padded to have an even length.
356     *
357     * The block data consists of a 0-N records. A record has the following structure:
358     *
359     * 1. 2-byte prefix. The value is always 0x1C02 2. 1-byte record type. The record types are documented by the IPTC. See IptcConstants. 3. 2-byte record size
360     * (in bytes). 4. Record data, "record size" bytes long.
361     *
362     * Record data (unlike block data) is NOT padded to have an even length.
363     *
364     * Record data, for IPTC record, should always be ISO-8859-1. But according to SANSELAN-33, this isn't always the case.
365     *
366     * The exception is the first record in the block, which must always be a record version record, whose value is a two-byte number; the value is 0x02.
367     *
368     * Some IPTC blocks are missing this first "record version" record, so we don't require it.
369     */
370    public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, final ImagingParameters<JpegImagingParameters> params)
371            throws ImagingException, IOException {
372        final boolean strict = params != null && params.isStrict();
373
374        return parsePhotoshopSegment(bytes, strict);
375    }
376
377    public byte[] writeIptcBlock(final List<IptcRecord> elements) throws ImagingException, IOException {
378        return writeIptcBlock(elements, false);
379    }
380
381    public byte[] writeIptcBlock(List<IptcRecord> elements, final boolean forceUtf8Encoding) throws ImagingException, IOException {
382        Charset charset;
383        if (forceUtf8Encoding) {
384            // Using UTF-8 is forced
385            charset = StandardCharsets.UTF_8;
386        } else {
387            // Check if all values can be converted to bytes with DEFAULT_CHARSET,
388            // otherwise use UTF-8
389            charset = DEFAULT_CHARSET;
390            for (final IptcRecord element : elements) {
391                final byte[] recordData = element.getValue().getBytes(charset);
392                if (!new String(recordData, charset).equals(element.getValue())) {
393                    charset = StandardCharsets.UTF_8;
394                    break;
395                }
396            }
397        }
398        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
399        try (AbstractBinaryOutputStream bos = AbstractBinaryOutputStream.create(baos, getByteOrder())) {
400            if (!charset.equals(DEFAULT_CHARSET)) {
401                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
402                bos.write(IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER);
403                bos.write(ENV_TAG_CODED_CHARACTER_SET);
404                final byte[] codedCharset = CHARACTER_ESCAPE_SEQUENCE;
405                bos.write2Bytes(codedCharset.length);
406                bos.write(codedCharset);
407            }
408
409            // first, right record version record
410            bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
411            bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
412            bos.write(IptcTypes.RECORD_VERSION.type); // record version record
413                                                      // type.
414            bos.write2Bytes(2); // record version record size
415            bos.write2Bytes(2); // record version value
416
417            // make a copy of the list.
418            elements = new ArrayList<>(elements);
419
420            // sort the list. Records must be in numerical order.
421            final Comparator<IptcRecord> comparator = (e1, e2) -> e2.iptcType.getType() - e1.iptcType.getType();
422            elements.sort(comparator);
423            // TODO: make sure order right
424
425            // write the list.
426            for (final IptcRecord element : elements) {
427                if (element.iptcType == IptcTypes.RECORD_VERSION) {
428                    continue; // ignore
429                }
430
431                bos.write(IptcConstants.IPTC_RECORD_TAG_MARKER);
432                bos.write(IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER);
433                if (element.iptcType.getType() < 0 || element.iptcType.getType() > 0xff) {
434                    throw new ImagingException("Invalid record type: " + element.iptcType.getType());
435                }
436                bos.write(element.iptcType.getType());
437
438                final byte[] recordData = element.getValue().getBytes(charset);
439                /*
440                 * if (!new String(recordData, charset).equals(element.getValue())) { throw new ImageWriteException( "Invalid record value, not " +
441                 * charset.name()); }
442                 */
443
444                bos.write2Bytes(recordData.length);
445                bos.write(recordData);
446            }
447        }
448
449        return baos.toByteArray();
450    }
451
452    public byte[] writePhotoshopApp13Segment(final PhotoshopApp13Data data) throws IOException, ImagingException {
453        try (ByteArrayOutputStream os = new ByteArrayOutputStream();
454                AbstractBinaryOutputStream bos = AbstractBinaryOutputStream.bigEndian(os)) {
455
456            JpegConstants.PHOTOSHOP_IDENTIFICATION_STRING.writeTo(bos);
457
458            final List<IptcBlock> blocks = data.getRawBlocks();
459            for (final IptcBlock block : blocks) {
460                bos.write4Bytes(JpegConstants.CONST_8BIM);
461
462                if (block.getBlockType() < 0 || block.getBlockType() > 0xffff) {
463                    throw new ImagingException("Invalid IPTC block type.");
464                }
465                bos.write2Bytes(block.getBlockType());
466
467                final byte[] blockNameBytes = block.getBlockNameBytes();
468                if (blockNameBytes.length > 255) {
469                    throw new ImagingException("IPTC block name is too long: " + blockNameBytes.length);
470                }
471                bos.write(blockNameBytes.length);
472                bos.write(blockNameBytes);
473                if (blockNameBytes.length % 2 == 0) {
474                    bos.write(0); // pad to even size, including length byte.
475                }
476
477                final byte[] blockData = block.getBlockData();
478                if (blockData.length > IptcConstants.IPTC_NON_EXTENDED_RECORD_MAXIMUM_SIZE) {
479                    throw new ImagingException("IPTC block data is too long: " + blockData.length);
480                }
481                bos.write4Bytes(blockData.length);
482                bos.write(blockData);
483                if (blockData.length % 2 == 1) {
484                    bos.write(0); // pad to even size
485                }
486            }
487
488            bos.flush();
489            return os.toByteArray();
490        }
491    }
492
493}