001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.HttpURLConnection;
028import java.net.URL;
029import java.net.URLConnection;
030import java.nio.file.Files;
031import java.nio.file.Path;
032import java.text.MessageFormat;
033import java.util.Locale;
034import java.util.Objects;
035import java.util.regex.Matcher;
036import java.util.regex.Pattern;
037
038import org.apache.commons.io.ByteOrderMark;
039import org.apache.commons.io.IOUtils;
040
041/**
042 * Character stream that handles all the necessary Voodoo to figure out the
043 * charset encoding of the XML document within the stream.
044 * <p>
045 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader.
046 * This one IS a character stream.
047 * </p>
048 * <p>
 * All this has to be done without consuming characters from the stream; if not,
 * the XML parser will not recognize the document as valid XML. This is not
051 * 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers
052 * right now, XmlStreamReader handles it and things work in all parsers).
053 * </p>
054 * <p>
055 * The XmlStreamReader class handles the charset encoding of XML documents in
056 * Files, raw streams and HTTP streams by offering a wide set of constructors.
057 * </p>
058 * <p>
 * By default the charset encoding detection is lenient; the constructor with
 * the lenient flag can be used for strict detection (following HTTP MIME and XML
 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
062 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types">
063 * Determining the character encoding of a feed</a>.
064 * </p>
065 * <p>
066 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under
067 * Apache License 2.0.
068 * </p>
069 *
070 * @see org.apache.commons.io.output.XmlStreamWriter
071 * @since 2.0
072 */
073public class XmlStreamReader extends Reader {
    // Charset names used by the detection logic when comparing and returning encodings.
    private static final String UTF_8 = "UTF-8";

    private static final String US_ASCII = "US-ASCII";

    private static final String UTF_16BE = "UTF-16BE";

    private static final String UTF_16LE = "UTF-16LE";

    private static final String UTF_32BE = "UTF-32BE";

    private static final String UTF_32LE = "UTF-32LE";

    // Byte-order-neutral names; the detection code also uses them as prefixes of the BE/LE names.
    private static final String UTF_16 = "UTF-16";

    private static final String UTF_32 = "UTF-32";

    // Charset name attached to the EBCDIC entry of XML_GUESS_BYTES.
    private static final String EBCDIC = "CP1047";

    // Byte order marks the constructors pass to the first (outer-most) BOMInputStream.
    private static final ByteOrderMark[] BOMS = {
        ByteOrderMark.UTF_8,
        ByteOrderMark.UTF_16BE,
        ByteOrderMark.UTF_16LE,
        ByteOrderMark.UTF_32BE,
        ByteOrderMark.UTF_32LE
    };

    // UTF_16LE and UTF_32LE have the same two starting BOM bytes.
    // Pseudo "byte order marks" used to guess the encoding from the first bytes of the document:
    // each entry is the byte sequence of the start of an XML declaration in the named charset
    // (0x3C 0x3F 0x78 0x6D is ASCII "<?xm"; the UTF-16 entries only cover "<?").
    // The EBCDIC entry is presumably "<?xm" in CP1047 -- TODO confirm against a code page table.
    private static final ByteOrderMark[] XML_GUESS_BYTES = {
        new ByteOrderMark(UTF_8,    0x3C, 0x3F, 0x78, 0x6D),
        new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F),
        new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
        new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C,
                0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
        new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00,
                0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
        new ByteOrderMark(EBCDIC,   0x4C, 0x6F, 0xA7, 0x94)
    };

    // Finds a "charset=" parameter (matched case-sensitively) whose value may be wrapped in
    // single or double quotes; group 1 is the value up to the next ';', space or quote.
    private static final Pattern CHARSET_PATTERN = Pattern
            .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
114
115    /**
116     * Pattern capturing the encoding of the "xml" processing instruction.
117     * <p>
118     * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>.
119     * </p>
120     */
121    public static final Pattern ENCODING_PATTERN = Pattern.compile(
122    // @formatter:off
123            "^<\\?xml\\s+"
124            + "version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+"
125            + "encoding\\s*=\\s*((?:\"[A-Za-z]([A-Za-z0-9\\._]|-)*\")|(?:'[A-Za-z]([A-Za-z0-9\\\\._]|-)*'))",
126            Pattern.MULTILINE);
127    // @formatter:on
128
    // MessageFormat templates for XmlStreamReaderException messages.
    // Raw-stream templates: {0} = BOM encoding, {1} = XML guess encoding, {2} = XML prolog encoding.

    // BOM disagrees with the guessed or declared encoding.
    private static final String RAW_EX_1 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    // BOM encoding is not one of the handled UTF-8/16/32 variants.
    private static final String RAW_EX_2 =
        "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    // HTTP templates: {0} = content-type MIME, {1} = content-type encoding, {2} = BOM encoding,
    // {3} = XML guess encoding, {4} = XML prolog encoding.

    // Content type names an explicit byte order (BE/LE) but the stream carries a BOM.
    private static final String HTTP_EX_1 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL";

    // Content type names UTF-16/UTF-32 but the BOM is missing or belongs to another family.
    private static final String HTTP_EX_2 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    // MIME type is neither in the application XML nor in the text XML family.
    private static final String HTTP_EX_3 =
        "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME";
143
144    /**
145     * Returns charset parameter value, NULL if not present, NULL if
146     * httpContentType is NULL.
147     *
148     * @param httpContentType the HTTP content type
149     * @return The content type encoding (upcased)
150     */
151    static String getContentTypeEncoding(final String httpContentType) {
152        String encoding = null;
153        if (httpContentType != null) {
154            final int i = httpContentType.indexOf(";");
155            if (i > -1) {
156                final String postMime = httpContentType.substring(i + 1);
157                final Matcher m = CHARSET_PATTERN.matcher(postMime);
158                encoding = m.find() ? m.group(1) : null;
159                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
160            }
161        }
162        return encoding;
163    }
164
165    /**
166     * Returns MIME type or NULL if httpContentType is NULL.
167     *
168     * @param httpContentType the HTTP content type
169     * @return The mime content type
170     */
171    static String getContentTypeMime(final String httpContentType) {
172        String mime = null;
173        if (httpContentType != null) {
174            final int i = httpContentType.indexOf(";");
175            if (i >= 0) {
176                mime = httpContentType.substring(0, i);
177            } else {
178                mime = httpContentType;
179            }
180            mime = mime.trim();
181        }
182        return mime;
183    }
184
185    /**
186     * Returns the encoding declared in the <?xml encoding=...?>, NULL if none.
187     *
188     * @param inputStream InputStream to create the reader from.
189     * @param guessedEnc guessed encoding
190     * @return the encoding declared in the <?xml encoding=...?>
191     * @throws IOException thrown if there is a problem reading the stream.
192     */
193    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc)
194            throws IOException {
195        String encoding = null;
196        if (guessedEnc != null) {
197            final byte[] bytes = IOUtils.byteArray();
198            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
199            int offset = 0;
200            int max = IOUtils.DEFAULT_BUFFER_SIZE;
201            int c = inputStream.read(bytes, offset, max);
202            int firstGT = -1;
203            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
204            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
205                offset += c;
206                max -= c;
207                c = inputStream.read(bytes, offset, max);
208                xmlProlog = new String(bytes, 0, offset, guessedEnc);
209                firstGT = xmlProlog.indexOf('>');
210            }
211            if (firstGT == -1) {
212                if (c == -1) {
213                    throw new IOException("Unexpected end of XML stream");
214                }
215                throw new IOException(
216                        "XML prolog or ROOT element not found on first "
217                                + offset + " bytes");
218            }
219            final int bytesRead = offset;
220            if (bytesRead > 0) {
221                inputStream.reset();
222                final BufferedReader bReader = new BufferedReader(new StringReader(
223                        xmlProlog.substring(0, firstGT + 1)));
224                final StringBuffer prolog = new StringBuffer();
225                String line;
226                while ((line = bReader.readLine()) != null) {
227                    prolog.append(line);
228                }
229                final Matcher m = ENCODING_PATTERN.matcher(prolog);
230                if (m.find()) {
231                    encoding = m.group(1).toUpperCase(Locale.ROOT);
232                    encoding = encoding.substring(1, encoding.length() - 1);
233                }
234            }
235        }
236        return encoding;
237    }
238
239    /**
240     * Indicates if the MIME type belongs to the APPLICATION XML family.
241     *
242     * @param mime The mime type
243     * @return true if the mime type belongs to the APPLICATION XML family,
244     * otherwise false
245     */
246    static boolean isAppXml(final String mime) {
247        return mime != null &&
248               (mime.equals("application/xml") ||
249                mime.equals("application/xml-dtd") ||
250                mime.equals("application/xml-external-parsed-entity") ||
251               mime.startsWith("application/") && mime.endsWith("+xml"));
252    }
253
254    /**
255     * Indicates if the MIME type belongs to the TEXT XML family.
256     *
257     * @param mime The mime type
258     * @return true if the mime type belongs to the TEXT XML family,
259     * otherwise false
260     */
261    static boolean isTextXml(final String mime) {
262        return mime != null &&
263              (mime.equals("text/xml") ||
264               mime.equals("text/xml-external-parsed-entity") ||
265              mime.startsWith("text/") && mime.endsWith("+xml"));
266    }
267
    /** Reader the constructors create over the wrapped stream using the detected encoding; closed by {@link #close()}. */
    private final Reader reader;

    /** Charset name selected by the detection logic run in the constructors. */
    private final String encoding;

    /**
     * Caller-supplied fallback encoding, may be {@code null}. When {@code null} the detection
     * code falls back to UTF-8 or US-ASCII, depending on the detection path.
     */
    private final String defaultEncoding;
273
    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
     * if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * </p>
     * <p>
     * The file is converted with {@link File#toPath()} and the work is delegated to
     * the {@link Path} constructor.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if {@code file} is {@code null}.
     * @throws IOException thrown if there is a problem reading the file.
     */
    public XmlStreamReader(final File file) throws IOException {
        this(Objects.requireNonNull(file, "file").toPath());
    }
289
    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * It does a lenient charset encoding detection (this constructor delegates with
     * {@code lenient} set to {@code true}), check the constructor with
     * the lenient parameter for details.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }
304
    /**
     * Creates a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files. This constructor delegates to the
     * three-argument constructor with a {@code null} default encoding.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * </p>
     * <ul>
     * <li>If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
     * <li>Else if the content type had a charset encoding that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
        this(inputStream, lenient, null);
    }
335
336    /**
337     * Creates a Reader for a raw InputStream.
338     * <p>
339     * It follows the same logic used for files.
340     * <p>
341     * If lenient detection is indicated and the detection above fails as per
342     * specifications it then attempts the following:
343     * <p>
344     * If the content type was 'text/html' it replaces it with 'text/xml' and
345     * tries the detection again.
346     * <p>
347     * Else if the XML prolog had a charset encoding that encoding is used.
348     * <p>
349     * Else if the content type had a charset encoding that encoding is used.
350     * <p>
351     * Else 'UTF-8' is used.
352     * <p>
353     * If lenient detection is indicated an XmlStreamReaderException is never
354     * thrown.
355     *
356     * @param inputStream InputStream to create a Reader from.
357     * @param lenient indicates if the charset encoding detection should be
358     *        relaxed.
359     * @param defaultEncoding The default encoding
360     * @throws IOException thrown if there is a problem reading the stream.
361     * @throws XmlStreamReaderException thrown if the charset encoding could not
362     *         be determined according to the specs.
363     */
364    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
365    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding)
366            throws IOException {
367        Objects.requireNonNull(inputStream, "inputStream");
368        this.defaultEncoding = defaultEncoding;
369        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
370        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
371        this.encoding = doRawStream(bom, pis, lenient);
372        this.reader = new InputStreamReader(pis, encoding);
373    }
374
    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection (this constructor delegates with
     * {@code lenient} set to {@code true}), check the constructor with
     * the lenient parameter for details.
     * </p>
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @throws IOException thrown if there is a problem reading the stream.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType)
            throws IOException {
        this(inputStream, httpContentType, true);
    }
396
    /**
     * Creates a Reader using an InputStream and the associated content-type
     * header. Whether the encoding detection is lenient is controlled by the
     * {@code lenient} parameter; the default encoding passed on is {@code null}.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the
     * content-type encoding. If there is not content-type encoding checks the
     * XML prolog encoding. If there is not XML prolog encoding uses the default
     * encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per
     * specifications it then attempts the following:
     * </p>
     * <ul>
     * <li>If the content type was 'text/html' it replaces it with 'text/xml' and
     * tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
     * <li>Else if the content type had a charset encoding that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never
     * thrown.
     * </p>
     *
     * @param inputStream InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of
     *        the charset encoding.
     * @param lenient indicates if the charset encoding detection should be
     *        relaxed.
     * @throws IOException thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not
     *         be determined according to the specs.
     */
    public XmlStreamReader(final InputStream inputStream, final String httpContentType,
            final boolean lenient) throws IOException {
        this(inputStream, httpContentType, lenient, null);
    }
434
435
436    /**
437     * Creates a Reader using an InputStream and the associated content-type
438     * header. This constructor is lenient regarding the encoding detection.
439     * <p>
440     * First it checks if the stream has BOM. If there is not BOM checks the
441     * content-type encoding. If there is not content-type encoding checks the
442     * XML prolog encoding. If there is not XML prolog encoding uses the default
443     * encoding mandated by the content-type MIME type.
444     * <p>
445     * If lenient detection is indicated and the detection above fails as per
446     * specifications it then attempts the following:
447     * <p>
448     * If the content type was 'text/html' it replaces it with 'text/xml' and
449     * tries the detection again.
450     * <p>
451     * Else if the XML prolog had a charset encoding that encoding is used.
452     * <p>
453     * Else if the content type had a charset encoding that encoding is used.
454     * <p>
455     * Else 'UTF-8' is used.
456     * <p>
457     * If lenient detection is indicated an XmlStreamReaderException is never
458     * thrown.
459     *
460     * @param inputStream InputStream to create the reader from.
461     * @param httpContentType content-type header to use for the resolution of
462     *        the charset encoding.
463     * @param lenient indicates if the charset encoding detection should be
464     *        relaxed.
465     * @param defaultEncoding The default encoding
466     * @throws IOException thrown if there is a problem reading the file.
467     * @throws XmlStreamReaderException thrown if the charset encoding could not
468     *         be determined according to the specs.
469     */
470    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
471    public XmlStreamReader(final InputStream inputStream, final String httpContentType,
472            final boolean lenient, final String defaultEncoding) throws IOException {
473        Objects.requireNonNull(inputStream, "inputStream");
474        this.defaultEncoding = defaultEncoding;
475        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
476        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
477        this.encoding = processHttpStream(bom, pis, httpContentType, lenient);
478        this.reader = new InputStreamReader(pis, encoding);
479    }
480
    /**
     * Creates a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset,
     * if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * </p>
     * <p>
     * The file is opened with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)}
     * and handed to the {@link InputStream} constructor.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if {@code file} is {@code null}.
     * @throws IOException thrown if there is a problem reading the file.
     * @since 2.11.0
     */
    // NOTE(review): if the delegated constructor throws, nothing here closes the stream that was just opened.
    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
    public XmlStreamReader(final Path file) throws IOException {
        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
    }
498
    /**
     * Creates a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in
     * the fetched data it uses the same logic used for Files.
     * </p>
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the
     * fetched data it uses the same logic used for an InputStream with
     * content-type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with
     * the lenient parameter for details.
     * </p>
     * <p>
     * The connection is obtained with {@link URL#openConnection()} and passed to the
     * {@link URLConnection} constructor with a {@code null} default encoding.
     * </p>
     *
     * @param url URL to create a Reader from.
     * @throws NullPointerException if {@code url} is {@code null}.
     * @throws IOException thrown if there is a problem reading the stream of
     *         the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        this(Objects.requireNonNull(url, "url").openConnection(), null);
    }
519
520    /**
521     * Creates a Reader using the InputStream of a URLConnection.
522     * <p>
523     * If the URLConnection is not of type HttpURLConnection and there is not
524     * 'content-type' header in the fetched data it uses the same logic used for
525     * files.
526     * <p>
527     * If the URLConnection is a HTTP Url or there is a 'content-type' header in
528     * the fetched data it uses the same logic used for an InputStream with
529     * content-type.
530     * <p>
531     * It does a lenient charset encoding detection, check the constructor with
532     * the lenient parameter for details.
533     *
534     * @param conn URLConnection to create a Reader from.
535     * @param defaultEncoding The default encoding
536     * @throws IOException thrown if there is a problem reading the stream of
537     *         the URLConnection.
538     */
539    public XmlStreamReader(final URLConnection conn, final String defaultEncoding) throws IOException {
540        Objects.requireNonNull(conn, "conn");
541        this.defaultEncoding = defaultEncoding;
542        final boolean lenient = true;
543        final String contentType = conn.getContentType();
544        final InputStream inputStream = conn.getInputStream();
545        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
546        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE), false, BOMS);
547        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
548        if (conn instanceof HttpURLConnection || contentType != null) {
549            this.encoding = processHttpStream(bom, pis, contentType, lenient);
550        } else {
551            this.encoding = doRawStream(bom, pis, lenient);
552        }
553        this.reader = new InputStreamReader(pis, encoding);
554    }
555
556    /**
557     * Calculate the HTTP encoding.
558     *
559     * @param httpContentType The HTTP content type
560     * @param bomEnc BOM encoding
561     * @param xmlGuessEnc XML Guess encoding
562     * @param xmlEnc XML encoding
563     * @param lenient indicates if the charset encoding detection should be
564     *        relaxed.
565     * @return the HTTP encoding
566     * @throws IOException thrown if there is a problem reading the stream.
567     */
568    String calculateHttpEncoding(final String httpContentType,
569            final String bomEnc, final String xmlGuessEnc, final String xmlEnc,
570            final boolean lenient) throws IOException {
571
572        // Lenient and has XML encoding
573        if (lenient && xmlEnc != null) {
574            return xmlEnc;
575        }
576
577        // Determine mime/encoding content types from HTTP Content Type
578        final String cTMime = getContentTypeMime(httpContentType);
579        final String cTEnc  = getContentTypeEncoding(httpContentType);
580        final boolean appXml  = isAppXml(cTMime);
581        final boolean textXml = isTextXml(cTMime);
582
583        // Mime type NOT "application/xml" or "text/xml"
584        if (!appXml && !textXml) {
585            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
586            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
587        }
588
589        // No content type encoding
590        if (cTEnc == null) {
591            if (appXml) {
592                return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
593            }
594            return defaultEncoding == null ? US_ASCII : defaultEncoding;
595        }
596
597        // UTF-16BE or UTF-16LE content type encoding
598        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
599            if (bomEnc != null) {
600                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
601                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
602            }
603            return cTEnc;
604        }
605
606        // UTF-16 content type encoding
607        if (cTEnc.equals(UTF_16)) {
608            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
609                return bomEnc;
610            }
611            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
612            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
613        }
614
615        // UTF-32BE or UTF-132E content type encoding
616        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
617            if (bomEnc != null) {
618                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
619                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
620            }
621            return cTEnc;
622        }
623
624        // UTF-32 content type encoding
625        if (cTEnc.equals(UTF_32)) {
626            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
627                return bomEnc;
628            }
629            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
630            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
631        }
632
633        return cTEnc;
634    }
635
636    /**
637     * Calculate the raw encoding.
638     *
639     * @param bomEnc BOM encoding
640     * @param xmlGuessEnc XML Guess encoding
641     * @param xmlEnc XML encoding
642     * @return the raw encoding
643     * @throws IOException thrown if there is a problem reading the stream.
644     */
645    String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
646            final String xmlEnc) throws IOException {
647
648        // BOM is Null
649        if (bomEnc == null) {
650            if (xmlGuessEnc == null || xmlEnc == null) {
651                return defaultEncoding == null ? UTF_8 : defaultEncoding;
652            }
653            if (xmlEnc.equals(UTF_16) &&
654               (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
655                return xmlGuessEnc;
656            }
657            return xmlEnc;
658        }
659
660        // BOM is UTF-8
661        if (bomEnc.equals(UTF_8)) {
662            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
663                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
664                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
665            }
666            if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
667                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
668                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
669            }
670            return bomEnc;
671        }
672
673        // BOM is UTF-16BE or UTF-16LE
674        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
675            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
676                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
677                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
678            }
679            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
680                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
681                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
682            }
683            return bomEnc;
684        }
685
686        // BOM is UTF-32BE or UTF-32LE
687        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
688            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
689                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
690                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
691            }
692            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
693                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
694                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
695            }
696            return bomEnc;
697        }
698
699        // BOM is something else
700        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
701        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
702    }
703
    /**
     * Closes the XmlStreamReader stream by closing the underlying reader that was
     * created by the constructor.
     *
     * @throws IOException thrown if there was a problem closing the stream.
     */
    @Override
    public void close() throws IOException {
        reader.close();
    }
713
714    /**
715     * Do lenient detection.
716     *
717     * @param httpContentType content-type header to use for the resolution of
718     *        the charset encoding.
719     * @param ex The thrown exception
720     * @return the encoding
721     * @throws IOException thrown if there is a problem reading the stream.
722     */
723    private String doLenientDetection(String httpContentType,
724            XmlStreamReaderException ex) throws IOException {
725        if (httpContentType != null && httpContentType.startsWith("text/html")) {
726            httpContentType = httpContentType.substring("text/html".length());
727            httpContentType = "text/xml" + httpContentType;
728            try {
729                return calculateHttpEncoding(httpContentType, ex.getBomEncoding(),
730                        ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true);
731            } catch (final XmlStreamReaderException ex2) {
732                ex = ex2;
733            }
734        }
735        String encoding = ex.getXmlEncoding();
736        if (encoding == null) {
737            encoding = ex.getContentTypeEncoding();
738        }
739        if (encoding == null) {
740            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
741        }
742        return encoding;
743    }
744
745    /**
746     * Process the raw stream.
747     *
748     * @param bom BOMInputStream to detect byte order marks
749     * @param pis BOMInputStream to guess XML encoding
750     * @param lenient indicates if the charset encoding detection should be
751     *        relaxed.
752     * @return the encoding to be used
753     * @throws IOException thrown if there is a problem reading the stream.
754     */
755    private String doRawStream(final BOMInputStream bom, final BOMInputStream pis, final boolean lenient)
756            throws IOException {
757        final String bomEnc      = bom.getBOMCharsetName();
758        final String xmlGuessEnc = pis.getBOMCharsetName();
759        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
760        try {
761            return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
762        } catch (final XmlStreamReaderException ex) {
763            if (lenient) {
764                return doLenientDetection(null, ex);
765            }
766            throw ex;
767        }
768    }
769
770    /**
771     * Returns the default encoding to use if none is set in HTTP content-type,
772     * XML prolog and the rules based on content-type are not adequate.
773     * <p>
774     * If it is NULL the content-type based rules are used.
775     *
776     * @return the default encoding to use.
777     */
778    public String getDefaultEncoding() {
779        return defaultEncoding;
780    }
781
782    /**
783     * Returns the charset encoding of the XmlStreamReader.
784     *
785     * @return charset encoding.
786     */
787    public String getEncoding() {
788        return encoding;
789    }
790
791    /**
792     * Process a HTTP stream.
793     *
794     * @param bom BOMInputStream to detect byte order marks
795     * @param pis BOMInputStream to guess XML encoding
796     * @param httpContentType The HTTP content type
797     * @param lenient indicates if the charset encoding detection should be
798     *        relaxed.
799     * @return the encoding to be used
800     * @throws IOException thrown if there is a problem reading the stream.
801     */
802    private String processHttpStream(final BOMInputStream bom, final BOMInputStream pis, final String httpContentType,
803        final boolean lenient) throws IOException {
804        final String bomEnc = bom.getBOMCharsetName();
805        final String xmlGuessEnc = pis.getBOMCharsetName();
806        final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
807        try {
808            return calculateHttpEncoding(httpContentType, bomEnc, xmlGuessEnc, xmlEnc, lenient);
809        } catch (final XmlStreamReaderException ex) {
810            if (lenient) {
811                return doLenientDetection(httpContentType, ex);
812            }
813            throw ex;
814        }
815    }
816
817    /**
818     * Invokes the underlying reader's {@code read(char[], int, int)} method.
819     * @param buf the buffer to read the characters into
820     * @param offset The start offset
821     * @param len The number of bytes to read
822     * @return the number of characters read or -1 if the end of stream
823     * @throws IOException if an I/O error occurs.
824     */
825    @Override
826    public int read(final char[] buf, final int offset, final int len) throws IOException {
827        return reader.read(buf, offset, len);
828    }
829
830}