LCOV - code coverage report
Current view: top level - lib/helper - xml-encoding-sniffer.js (source / functions) Hit Total Coverage
Test: lcov.info Lines: 379 543 69.8 %
Date: 2024-12-07 00:20:21 Functions: 7 7 100.0 %
Branches: 50 102 49.0 %

           Branch data     Line data    Source code
       1            [ + ]:        348 : /**
       2                 :        348 :  * W3C Extensible Markup Language (XML) 1.0 (Fifth Edition)
       3                 :        348 :  * W3C Recommendation 26 November 2008
       4                 :        348 :  * url: https://www.w3.org/TR/xml/#charencoding
       5                 :        348 :  *
       6                 :        348 :  * function xmlEncodingSniffer(sample)
       7                 :        348 :  * function getXmlDeclaredEncoding(sample,  guessedEncoding)
       8                 :        348 :  * function getStringFromByteArray(message, guessedEncoding)
       9                 :        348 :  * function detectUnicodeInByteSampleByHeuristics(sampleBytes)
      10                 :        348 :  * function detectSuspiciousUTF8SequenceLength(sampleBytes, currentPos)
      11                 :        348 :  * function detectBOMBytes(bomBytes)
      12                 :        348 :  * function isCommonUSASCIIByte(byte)
      13                 :        348 :  *
      14                 :        348 :  * xmlEncodingSniffer                    => detectBOMBytes, getXmlDeclaredEncoding
      15                 :        348 :  * getXmlDeclaredEncoding                => getStringFromByteArray
      16                 :        348 :  * getStringFromByteArray                => detectBOMBytes, detectUnicodeInByteSampleByHeuristics
      17                 :        348 :  * detectUnicodeInByteSampleByHeuristics => detectSuspiciousUTF8SequenceLength
      18                 :        348 :  * detectSuspiciousUTF8SequenceLength    => NULL
      19                 :        348 :  * detectBOMBytes                        => NULL
      20                 :        348 :  * isCommonUSASCIIByte(byte)             => NULL
      21                 :        348 :  *
      22                 :        348 :  * @module  helper-xml-encoding-sniffer
      23                 :        348 :  * @desc    Helper module - Helper functions for the main module {@link module:whatwg-xhr whatwg-xhr}.
      24                 :        348 :  * @version 1.0.0
      25                 :        348 :  * @author  Essam A. El-Sherif
      26                 :        348 :  */
      27                 :        348 : 
      28                 :        348 : /**
      29                 :        348 :  * @func    xmlEncodingSniffer
      30                 :        348 :  * @static
      31                 :        348 :  * @param   {object} Sample Buffer data.
      32                 :        348 :  * @return  {string} Encoding detected.
      33                 :        348 :  * @desc    Auto-detection of character encoding of XML data.
      34                 :        348 :  */
      35            [ + ]:        348 : export function xmlEncodingSniffer(sample){
      36                 :         36 : 
      37                 :         36 :         let encodingFound1 = null;
      38                 :         36 :         let encodingFound2 = null;
      39                 :         36 : 
      40                 :         36 :         // look for the BOM in the read sample
      41                 :         36 :         encodingFound1 = detectBOMBytes(sample);
      42                 :         36 : 
      43                 :         36 :         // if the encoding was not detected due to a missing or unrecognizable BOM,
      44                 :         36 :         // try to detect from the binary representation of the string "<?xml"
      45                 :         36 :         let checkPseudoAttribute = false;
      46                 :         36 : 
      47            [ + ]:         36 :         if(encodingFound1 === null){
      48                 :         20 : 
      49  [ - ][ - ][ - ]:         20 :                 if(sample[0] === 0x00 && sample[1] === 0x3C && sample[2] === 0x00 && sample[3] === 0x3F){
                    [ - ]
      50                 :          0 : 
      51                 :          0 :                         // UTF-16BE or big-endian ISO-10646-UCS-2 or other encoding with a 16-bit code unit
      52                 :          0 :                         // in big-endian order and ASCII characters encoded as ASCII values.
      53                 :          0 :                         // (the encoding declaration must be read to determine which)
      54                 :          0 : 
      55                 :          0 :                         encodingFound1 = 'UTF-16BE';
      56                 :          0 :                         checkPseudoAttribute = true;
      57                 :          0 :                 }
      58                 :         20 :                 else
      59  [ - ][ - ][ - ]:         20 :                 if(sample[0] === 0x00 && sample[1] === 0x00 && sample[2] === 0x00 && sample[3] === 0x3C){
                    [ - ]
      60                 :          0 : 
      61                 :          0 :                         // UTF-32BE
      62                 :          0 :                         // (the encoding declaration must be read to determine which)
      63                 :          0 : 
      64                 :          0 :                         encodingFound1 = 'UTF-32BE';
      65                 :          0 :                         checkPseudoAttribute = true;
      66                 :          0 :                 }
      67                 :         20 :                 else
      68       [ - ][ - ]:         20 :                 if(sample[0] === 0xFF && sample[1] === 0xFE){
      69                 :          0 : 
      70                 :          0 :                         encodingFound1 = 'UTF-16LE';
      71                 :          0 :                 }
      72                 :         20 :                 else
      73       [ - ][ - ]:         20 :                 if(sample[0] === 0xFE && sample[1] === 0xFF){
      74                 :          0 : 
      75                 :          0 :                         encodingFound1 = 'UTF-16BE';
      76                 :          0 :                 }
      77                 :         20 :                 else
      78  [ + ][ - ][ - ]:         20 :                 if(sample[0] === 0x3C && sample[1] === 0x00 && sample[2] === 0x00 && sample[3] === 0x00){
                    [ - ]
      79                 :          0 : 
      80                 :          0 :                         // (the encoding declaration must be read to determine which)
      81                 :          0 : 
      82                 :          0 :                         encodingFound1 = 'UTF-32';
      83                 :          0 :                         checkPseudoAttribute = true;
      84                 :          0 :                 }
      85                 :         20 :                 else
      86  [ + ][ - ][ - ]:         20 :                 if(sample[0] === 0x3C && sample[1] === 0x00 && sample[2] === 0x3F && sample[3] === 0x00){
                    [ - ]
      87                 :          0 : 
      88                 :          0 :                         // UTF-16LE or little-endian ISO-10646-UCS-2 or other encoding with a 16-bit code unit
      89                 :          0 :                         // in little-endian order and ASCII characters encoded as ASCII values
      90                 :          0 :                         // (the encoding declaration must be read to determine which)
      91                 :          0 : 
      92                 :          0 :                         encodingFound1 = 'UTF-16LE';
      93                 :          0 :                         checkPseudoAttribute = true;
      94                 :          0 :                 }
      95                 :         20 :                 else
      96  [ + ][ + ][ + ]:         20 :                 if(sample[0] === 0x3C && sample[1] === 0x3F && sample[2] === 0x78 && sample[3] === 0x6D){
                    [ + ]
      97                 :          1 : 
      98                 :          1 :                         // UTF-8, ISO 646, ASCII, some part of ISO 8859 or any other 7-bit, 8-bit
      99                 :          1 :                         // (the encoding declaration must be read to determine which)
     100                 :          1 : 
     101                 :          1 :                         encodingFound1 = 'ASCII';
     102                 :          1 :                         checkPseudoAttribute = true;
     103            [ + ]:          1 :                 }
     104                 :         19 :                 else
     105  [ - ][ - ][ - ]:         19 :                 if(sample[0] === 0x4C && sample[1] === 0x6F && sample[2] === 0xA7 && sample[3] === 0x94){
                    [ - ]
     106                 :          0 : 
     107                 :          0 :                     // IBM037 - IBM EBCDIC US-Canada"CP037";
     108                 :          0 : 
     109                 :          0 :                         encodingFound1 = 'IBM037';
     110                 :          0 :                 }
     111                 :         36 :         }       // if (encodingFound1 === null)
     112                 :         36 : 
     113                 :         36 :         // Now read the encoding pseudoattribute in the XML header, if present
     114            [ + ]:         36 :         encodingFound2 = getXmlDeclaredEncoding(sample, encodingFound1 || 'UTF-8');
     115                 :         36 : 
     116                 :         36 :         // when not declared, w3c says it is utf-8
     117                 :         36 :         if(encodingFound2 === null)
     118            [ + ]:         36 :                 encodingFound2 = 'UTF-8';
     119                 :         36 : 
     120                 :         36 :         // compare the 2 found encoding and decided which is the right one
     121                 :         36 : 
     122                 :         36 :         let winner = null;
     123            [ + ]:         36 :         if(encodingFound1 === encodingFound2){
     124                 :          2 :                 winner = encodingFound2;
     125            [ + ]:          2 :         }
     126                 :         22 :         else
     127            [ + ]:         22 :         if(encodingFound1 === null){
     128                 :         19 :                 winner = encodingFound2;
     129            [ + ]:         19 :         }
     130                 :          3 :         else
     131                 :          3 :         if(encodingFound2 === null){
     132                 :          3 :                 winner = encodingFound1;
     133                 :          3 :         }
     134                 :          3 :         else
     135            [ + ]:          3 :         if(checkPseudoAttribute){
     136                 :          1 :                 // Fine-tune the winner encoding. This is the most heuristic part, as some encoding
     137                 :          1 :                 // can be overloaded. E.g. ASCII might be UTF-7, UTF-8, ISO-8859...
     138                 :          1 : 
     139                 :          1 :                 if(
     140                 :          1 :                         (encodingFound1 === 'ASCII') &&
     141                 :          1 :                         (encodingFound2 === 'UTF-7' || encodingFound2 === 'UTF-8' || encodingFound2.toUpperCase().includes('ISO-8859'))
     142                 :          1 :                 ){
     143                 :          1 :                         winner = encodingFound2;
     144                 :          1 :                 }
     145                 :          1 :                 else{
     146                 :          1 :                         // I'm not sure here if throw an exception or accept encodingFound1 or encodingFound2,
     147                 :          1 :                         // as both are not null and not equals
     148                 :          1 :                         // throw new Error(
     149                 :          1 :                         //      `The text encoding and the encoding pseudo-attribute of the XML header mismatch ${encodingFound1} ${encodingFound2}`);
     150                 :          1 : 
     151                 :          1 :                         winner = encodingFound2;
     152                 :          1 :                 }
     153            [ + ]:          1 :         }
     154                 :          2 :         else{
     155                 :          2 :                         // encodingFound1 and encodingFound2 are different so none win
     156                 :          2 :                         // throw new Error(
     157                 :          2 :                         //      `The text encoding and the encoding pseudo-attribute of the XML header mismatch ${encodingFound1} ${encodingFound2}`);
     158                 :          2 : 
     159                 :          2 :                         winner = encodingFound2;
     160                 :          2 :         }
     161                 :         36 : 
     162                 :         36 :         // return the detected encoding
     163                 :         36 :         return winner;
     164                 :         36 : }
     165                 :        348 : 
     166            [ + ]:         36 : function getXmlDeclaredEncoding(sample, guessedEncoding){
     167                 :         36 : 
     168                 :         36 :         // capture the encoding from the xml declaraion
     169                 :         36 :         let contents = getStringFromByteArray(sample, guessedEncoding);
     170                 :         36 : 
     171                 :         36 :         let pattern = /<\?xml\s+version\=["']1\.0["']\s+encoding\=["'](?<encoding>[\w\-]+)["']/;
     172                 :         36 : 
     173                 :         36 :         let m = contents.match(pattern);
     174                 :         36 : 
     175            [ + ]:         36 :         return m && m.groups['encoding'];
     176                 :         36 : }
     177                 :        348 : 
     178            [ + ]:         36 : function getStringFromByteArray(message, guessedEncoding){
     179                 :         36 : 
     180                 :         36 :         // try to get the encoding from the byte array
     181                 :         36 :         let encodingFound = detectBOMBytes(message);
     182                 :         36 : 
     183            [ + ]:         36 :         if(encodingFound){
     184                 :          4 :                 let preamble;
     185                 :          4 : 
     186                 :          4 :                 switch(encodingFound){
     187                 :          4 :                         case 'UTF-16LE':
     188                 :          4 :                         case 'UTF-16LE':
     189                 :          4 :                                 preamble = 2;
     190                 :          4 :                                 break;
     191                 :          4 :                         case 'UTF-7':
     192            [ + ]:          4 :                         case 'UTF-8':
     193                 :          2 :                                 preamble = 3;
     194                 :          2 :                                 break;
     195                 :          4 :                         case 'UTF-32LE':
     196                 :          4 :                         case 'UTF-32BE':
     197                 :          4 :                                 preamble = 4;
     198                 :          4 :                                 break;
     199            [ + ]:          4 :                         default:
     200                 :          2 :                                 preamble = 0;
     201                 :          4 :                 }
     202                 :          4 : 
     203                 :          4 :                 let decoder = new TextDecoder(encodingFound);
     204                 :          4 : 
     205                 :          4 :                 return new TextDecoder(encodingFound).decode(message.subarray(preamble));
     206            [ + ]:          4 :         }
     207                 :         20 : 
     208                 :         36 :         encodingFound = detectUnicodeInByteSampleByHeuristics(message) || guessedEncoding;
     209                 :         36 : 
     210                 :         36 :         return new TextDecoder(encodingFound).decode( new Uint8Array(message).buffer );
     211                 :         36 : }
     212                 :        348 : 
     213            [ + ]:         32 : function detectUnicodeInByteSampleByHeuristics(sampleBytes){
     214                 :         32 : 
     215                 :         32 :         let oddBinaryNullsInSample      = 0;
     216                 :         32 :         let evenBinaryNullsInSample     = 0;
     217                 :         32 : 
     218                 :         32 :         let suspiciousUTF8SequenceCount = 0;
     219                 :         32 :         let suspiciousUTF8BytesTotal    = 0;
     220                 :         32 : 
     221                 :         32 :         let likelyUSASCIIBytesInSample  = 0;
     222                 :         32 : 
     223                 :         32 :         // Cycle through, keeping count of binary null positions, possible UTF-8
     224                 :         32 :         // sequences from upper ranges of Windows-1252, and probable US-ASCII
     225                 :         32 :         // character counts.
     226                 :         32 : 
     227                 :         32 :         let currentPos    = 0;
     228                 :         32 :         let skipUTF8Bytes = 0;
     229                 :         32 : 
     230            [ + ]:         32 :         while(currentPos < sampleBytes.length){
     231                 :        536 : 
     232                 :        536 :                 //binary null distribution
     233            [ - ]:        536 :                 if(sampleBytes[currentPos] === 0){
     234                 :          0 :                         if(currentPos % 2 === 0)
     235                 :          0 :                                 evenBinaryNullsInSample++;
     236                 :          0 :                         else
     237                 :          0 :                                 oddBinaryNullsInSample++;
     238                 :          0 :                 }
     239                 :        536 : 
     240                 :        536 :                 //likely US-ASCII characters
     241            [ + ]:        536 :                 if(isCommonUSASCIIByte(sampleBytes[currentPos])){
     242                 :        309 :                         likelyUSASCIIBytesInSample++;
     243                 :        309 :                 }
     244                 :        536 : 
     245                 :        536 :                 //suspicious sequences (look like UTF-8)
     246                 :        536 :                 if(skipUTF8Bytes === 0){
     247                 :        536 : 
     248                 :        536 :                         let lengthFound = detectSuspiciousUTF8SequenceLength(sampleBytes, currentPos);
     249                 :        536 : 
     250            [ - ]:        536 :                         if(lengthFound > 0){
     251                 :          0 : 
     252                 :          0 :                                 suspiciousUTF8SequenceCount++;
     253                 :          0 :                                 suspiciousUTF8BytesTotal += lengthFound;
     254                 :          0 :                                 skipUTF8Bytes = lengthFound - 1;
     255                 :          0 :                         }
     256            [ - ]:        536 :                 }
     257                 :          0 :                 else{
     258                 :          0 :                         skipUTF8Bytes--;
     259                 :          0 :                 }
     260                 :        536 : 
     261                 :        536 :                 currentPos++;
     262                 :        536 :         }
     263                 :         32 : 
     264                 :         32 :         // UTF-16 LE - in english / european environments, this is usually characterized by a
     265                 :         32 :         // high proportion of odd binary nulls (starting at 0), with (as this is text) a low
     266                 :         32 :         // proportion of even binary nulls.
     267                 :         32 :         // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
     268                 :         32 :         // 60% nulls where you do expect nulls) are completely arbitrary.
     269                 :         32 : 
     270                 :         32 :         if(
     271                 :         32 :                 ((evenBinaryNullsInSample * 2.0) / sampleBytes.length) < 0.2 &&
     272                 :         32 :                 ((oddBinaryNullsInSample * 2.0)  / sampleBytes.length) > 0.6
     273                 :         32 :         )
     274            [ - ]:         32 :                 return 'UTF-16LE';
     275                 :         32 : 
     276                 :         32 :         // UTF-16 BE - in english / european environments, this is usually characterized by a
     277                 :         32 :         // high proportion of even binary nulls (starting at 0), with (as this is text) a low
     278                 :         32 :         // proportion of odd binary nulls.
     279                 :         32 :         // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
     280                 :         32 :         // 60% nulls where you do expect nulls) are completely arbitrary.
     281                 :         32 : 
     282                 :         32 :         if(
     283                 :         32 :                 ((oddBinaryNullsInSample * 2.0) / sampleBytes.length)  < 0.2 &&
     284                 :         32 :                 ((evenBinaryNullsInSample * 2.0) / sampleBytes.length) > 0.6
     285                 :         32 :         )
     286            [ - ]:         32 :                 return 'UTF-16BE';
     287                 :         32 : 
     288                 :         32 :         // UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content
     289                 :         32 :         // using regexp, in his w3c.org unicode FAQ entry:
     290                 :         32 :         // http://www.w3.org/International/questions/qa-forms-utf-8
     291                 :         32 : 
     292                 :         32 :         let potentiallyMangledString = sampleBytes.toString('ascii');
     293                 :         32 : 
     294                 :         32 :         let utf8Validator = '';
     295                 :         32 : 
     296                 :         32 :         utf8Validator += '^(';
     297                 :         32 :         utf8Validator += '[\\x00-\\x7F]';                            // ASCII
     298                 :         32 :         utf8Validator += '|[\\xC2-\\xDF][\\x80-\\xBF]';              // non-overlong 2-byte
     299                 :         32 :         utf8Validator += '|\\xE0[\\xA0-\\xBF][\\x80-\\xBF]';         // excluding overlongs
     300                 :         32 :         utf8Validator += '|[\\xE1-\\xEC\\xEE\\xEF][\\x80-\\xBF]{2}'; // straight 3-byte
     301                 :         32 :         utf8Validator += '|\\xED[\\x80-\\x9F][\\x80-\\xBF]';         // excluding surrogates
     302                 :         32 :         utf8Validator += '|\\xF0[\\x90-\\xBF][\\x80-\\xBF]{2}';      // planes 1-3
     303                 :         32 :         utf8Validator += '|[\\xF1-\\xF3][\\x80-\\xBF]{3}';           // planes 4-15
     304                 :         32 :         utf8Validator += '|\\xF4[\\x80-\\x8F][\\x80-\\xBF]{2}';      // plane 16
     305                 :         32 :         utf8Validator += ')*$';
     306                 :         32 : 
     307                 :         32 :         utf8Validator = new RegExp(utf8Validator);
     308                 :         32 : 
     309                 :         32 :         if(utf8Validator.test(potentiallyMangledString)){
     310                 :         32 : 
     311                 :         32 :                 // Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
     312                 :         32 :                 // If all the characters are in the 0-127 range, no harm done, most western charsets are same
     313                 :         32 :                 // as UTF-8 in these ranges.
     314                 :         32 :                 // If some of the characters were in the upper range (western accented characters), however,
     315                 :         32 :                 // they would likely be mangled to 2-byte by the UTF-8 encoding process.
     316                 :         32 :                 // So, we need to play stats.
     317                 :         32 : 
     318                 :         32 :                 // The "Random" likelihood of any pair of randomly generated characters being one
     319                 :         32 :                 // of these "suspicious" character sequences is:
     320                 :         32 :                 // 128 / (256 * 256) = 0.2%.
     321                 :         32 :                 //
     322                 :         32 :                 // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
     323                 :         32 :                 // character range, so we assume that more than 1 in 500,000 of these character
     324                 :         32 :                 // sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
     325                 :         32 :                 //
     326                 :         32 :                 // We can only assume these character sequences will be rare if we ALSO assume that this
     327                 :         32 :                 // IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
     328                 :         32 :                 // not already suspicious sequences) should be plain US-ASCII bytes. This, I
     329                 :         32 :                 // arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
     330                 :         32 :                 // approx 40%, so the chances of hitting this threshold by accident in random data are
     331                 :         32 :                 // VERY low).
     332                 :         32 : 
     333                 :         32 :                 if
     334                 :         32 :                         (
     335            [ - ]:         32 :                                 (suspiciousUTF8SequenceCount * 500000.0 / sampleBytes.length >= 1) && //suspicious sequences
     336                 :          0 :                                 ( //all suspicious, so cannot evaluate proportion of US-Ascii
     337                 :          0 :                                     (sampleBytes.length - suspiciousUTF8BytesTotal === 0) ||
     338                 :          0 :                                         likelyUSASCIIBytesInSample * 1.0 / (sampleBytes.length - suspiciousUTF8BytesTotal) >= 0.8
     339                 :          0 :                                 )
     340                 :         32 :                 )
     341            [ - ]:         32 :                         return 'UTF-8';
     342                 :         32 :         }
     343                 :         32 : 
     344                 :         32 :         return null;
     345                 :         32 : }
     346                 :        348 : 
     347            [ + ]:        536 : function detectSuspiciousUTF8SequenceLength(sampleBytes, currentPos){
     348                 :        536 : 
     349                 :        536 :         let lengthFound = 0;
     350                 :        536 : 
     351                 :        536 :                 if(
     352                 :        536 :                         sampleBytes.length >= currentPos + 1 &&
     353                 :        536 :                         sampleBytes[currentPos] === 0xC2
     354            [ - ]:        536 :                 ){
     355                 :          0 :                         if(
     356                 :          0 :                                 sampleBytes[currentPos + 1] === 0x81 ||
     357                 :          0 :                                 sampleBytes[currentPos + 1] === 0x8D ||
     358                 :          0 :                                 sampleBytes[currentPos + 1] === 0x8F
     359                 :          0 :                         )
     360                 :          0 :                                 lengthFound = 2;
     361                 :          0 :                         else
     362                 :          0 :                         if(
     363                 :          0 :                                 sampleBytes[currentPos + 1] === 0x90 ||
     364                 :          0 :                                 sampleBytes[currentPos + 1] === 0x9D
     365                 :          0 :                         )
     366                 :          0 :                                 lengthFound = 2;
     367                 :          0 :                         else
     368                 :          0 :                         if(
     369                 :          0 :                                 sampleBytes[currentPos + 1] >= 0xA0 &&
     370                 :          0 :                                 sampleBytes[currentPos + 1] <= 0xBF
     371                 :          0 :                         )
     372                 :          0 :                                 lengthFound = 2;
     373                 :          0 :                 }
     374                 :        536 :                 else
     375                 :        536 :                 if(
     376                 :        536 :                         sampleBytes.length >= currentPos + 1 &&
     377                 :        536 :                         sampleBytes[currentPos] === 0xC3
     378            [ - ]:        536 :                 ){
     379                 :          0 :                         if(
     380                 :          0 :                                 sampleBytes[currentPos + 1] >= 0x80 &&
     381                 :          0 :                                 sampleBytes[currentPos + 1] <= 0xBF
     382                 :          0 :                         )
     383                 :          0 :                                 lengthFound = 2;
     384                 :          0 :                 }
     385                 :        536 :                 else
     386                 :        536 :                 if(
     387                 :        536 :                         sampleBytes.length >= currentPos + 1 &&
     388                 :        536 :                         sampleBytes[currentPos] === 0xC5
     389            [ - ]:        536 :                 ){
     390                 :          0 :                         if(
     391                 :          0 :                                 sampleBytes[currentPos + 1] === 0x92 ||
     392                 :          0 :                                 sampleBytes[currentPos + 1] === 0x93
     393                 :          0 :                         )
     394                 :          0 :                                 lengthFound = 2;
     395                 :          0 :                         else
     396                 :          0 :                         if(
     397                 :          0 :                                 sampleBytes[currentPos + 1] === 0xA0 ||
     398                 :          0 :                                 sampleBytes[currentPos + 1] === 0xA1
     399                 :          0 :                         )
     400                 :          0 :                                 lengthFound = 2;
     401                 :          0 :                         else
     402                 :          0 :                         if(
     403                 :          0 :                                 sampleBytes[currentPos + 1] === 0xB8 ||
     404                 :          0 :                                 sampleBytes[currentPos + 1] === 0xBD ||
     405                 :          0 :                                 sampleBytes[currentPos + 1] === 0xBE
     406                 :          0 :                         )
     407                 :          0 :                                 lengthFound = 2;
     408                 :          0 :                 }
     409                 :        536 :                 else
     410                 :        536 :                 if(
     411                 :        536 :                         sampleBytes.length >= currentPos + 1 &&
     412                 :        536 :                         sampleBytes[currentPos] === 0xC6
     413            [ - ]:        536 :                 ){
     414                 :          0 :                         if(sampleBytes[currentPos + 1] === 0x92)
     415                 :          0 :                                 lengthFound = 2;
     416                 :          0 :                 }
     417                 :        536 :                 else
     418                 :        536 :                 if(
     419                 :        536 :                         sampleBytes.length >= currentPos + 1 &&
     420                 :        536 :                         sampleBytes[currentPos] === 0xCB
     421            [ - ]:        536 :                 ){
     422                 :          0 :                         if(
     423                 :          0 :                                 sampleBytes[currentPos + 1] === 0x86 ||
     424                 :          0 :                                 sampleBytes[currentPos + 1] === 0x9C
     425                 :          0 :                         )
     426                 :          0 :                                 lengthFound = 2;
     427                 :          0 :                 }
     428                 :        536 :                 else
     429                 :        536 :                 if(
     430            [ + ]:        536 :                         sampleBytes.length >= currentPos + 2 &&
     431                 :        504 :                         sampleBytes[currentPos] === 0xE2
     432            [ - ]:        536 :                 ){
     433                 :          0 :                         if(sampleBytes[currentPos + 1] === 0x80){
     434                 :          0 :                                 if(
     435                 :          0 :                                         sampleBytes[currentPos + 2] === 0x93 ||
     436                 :          0 :                                         sampleBytes[currentPos + 2] === 0x94
     437                 :          0 :                                 )
     438                 :          0 :                                         lengthFound = 3;
     439                 :          0 : 
     440                 :          0 :                                 if(
     441                 :          0 :                                         sampleBytes[currentPos + 2] === 0x98 ||
     442                 :          0 :                                         sampleBytes[currentPos + 2] === 0x99 ||
     443                 :          0 :                                         sampleBytes[currentPos + 2] === 0x9A
     444                 :          0 :                                 )
     445                 :          0 :                                         lengthFound = 3;
     446                 :          0 : 
     447                 :          0 :                                 if(
     448                 :          0 :                                         sampleBytes[currentPos + 2] === 0x9C ||
     449                 :          0 :                                         sampleBytes[currentPos + 2] === 0x9D ||
     450                 :          0 :                                         sampleBytes[currentPos + 2] === 0x9E
     451                 :          0 :                                 )
     452                 :          0 :                                         lengthFound = 3;
     453                 :          0 : 
     454                 :          0 :                                 if(
     455                 :          0 :                                         sampleBytes[currentPos + 2] === 0xA0 ||
     456                 :          0 :                                         sampleBytes[currentPos + 2] === 0xA1 ||
     457                 :          0 :                                         sampleBytes[currentPos + 2] === 0xA2
     458                 :          0 :                                 )
     459                 :          0 :                                         lengthFound = 3;
     460                 :          0 : 
     461                 :          0 :                                 if(sampleBytes[currentPos + 2] === 0xA6)
     462                 :          0 :                                         lengthFound = 3;
     463                 :          0 : 
     464                 :          0 :                                 if(sampleBytes[currentPos + 2] === 0xB0)
     465                 :          0 :                                         lengthFound = 3;
     466                 :          0 : 
     467                 :          0 :                                 if(
     468                 :          0 :                                         sampleBytes[currentPos + 2] === 0xB9 ||
     469                 :          0 :                                         sampleBytes[currentPos + 2] === 0xBA
     470                 :          0 :                                 )
     471                 :          0 :                                         lengthFound = 3;
     472                 :          0 :                         }
     473                 :          0 :                         else
     474                 :          0 :                         if(
     475                 :          0 :                                 sampleBytes[currentPos + 1] === 0x82 &&
     476                 :          0 :                                 sampleBytes[currentPos + 2] === 0xAC
     477                 :          0 :                         )
     478                 :          0 :                                 lengthFound = 3;
     479                 :          0 :                         else
     480                 :          0 :                         if(
     481                 :          0 :                                 sampleBytes[currentPos + 1] === 0x84 &&
     482                 :          0 :                                 sampleBytes[currentPos + 2] === 0xA2
     483                 :          0 :                         )
     484                 :          0 :                                 lengthFound = 3;
     485                 :          0 :                 }
     486                 :        536 : 
     487                 :        536 :         return lengthFound;
     488                 :        536 : }
     489                 :        348 : 
     490            [ + ]:         72 : function detectBOMBytes(bomBytes){
     491                 :         72 : 
     492            [ - ]:         72 :         if (bomBytes.length < 2) return null;
     493                 :         72 : 
     494                 :         72 :         // UTF-16LE - Unicode UTF-16 little endian byte order
     495                 :         72 :         if (
     496            [ - ]:         72 :                 bomBytes[0] === 0xFF &&
     497            [ - ]:         72 :                 bomBytes[1] === 0xFE &&
     498                 :          0 :                 (bomBytes.length < 4 || bomBytes[2] != 0x00 || bomBytes[3] != 0x00)
     499                 :         72 :         )
     500            [ - ]:         72 :                 return 'UTF-16LE';
     501                 :         72 : 
     502                 :         72 :         // UTF-16BE - Unicode UTF-16 big endian byte order
     503            [ + ]:         72 :         if (bomBytes[0] === 0xFE && bomBytes[1] === 0xFF)
     504       [ + ][ + ]:         72 :                 return 'UTF-16BE';
     505                 :         44 : 
     506       [ + ][ - ]:         72 :         if (bomBytes.length < 3) return null;
     507                 :         44 : 
     508                 :         44 :         // UTF-8
     509       [ + ][ + ]:         72 :         if (bomBytes[0] === 0xEF && bomBytes[1] === 0xBB && bomBytes[2] === 0xBF)
     510       [ + ][ + ]:         72 :                 return 'UTF-8';
     511                 :         40 : 
     512                 :         40 :         // Character encodings such as UTF-7 that make overloaded usage of ASCII-valued
     513                 :         40 :         // bytes may fail to be reliably detected
     514       [ - ][ - ]:         72 :         if (bomBytes[0] === 0x2B && bomBytes[1] === 0x2F && bomBytes[2] === 0x76)
     515       [ + ][ - ]:         72 :                 return 'UTF-7';
     516                 :         40 : 
     517       [ + ][ - ]:         72 :         if (bomBytes.length < 4) return null;
     518                 :         40 : 
     519                 :         40 :         // UTF-32LE - Unicode UTF-32 little endian byte order
     520  [ - ][ - ][ - ]:         72 :         if (bomBytes[0] === 0xFF && bomBytes[1] === 0xFE && bomBytes[2] === 0x00 && bomBytes[3] === 0x00)
     521       [ + ][ - ]:         72 :                 return 'UTF-32LE';
     522                 :         40 : 
     523                 :         40 :         // UTF-32BE - Unicode UTF-32 big endian byte order
     524  [ - ][ - ][ - ]:         72 :         if (bomBytes[0] === 0x00 && bomBytes[1] === 0x00 && bomBytes[2] === 0xFE && bomBytes[3] === 0xFF)
     525       [ + ][ - ]:         72 :                 return 'UTF-32BE';
     526                 :         40 : 
     527                 :         40 :         return null;
     528                 :         72 : }
     529                 :        348 : 
     530            [ + ]:        536 : function isCommonUSASCIIByte(byte){
     531                 :        536 :         return (
     532                 :        536 :                 byte === 0x0A || //lf
     533                 :        536 :                 byte === 0x0D || //cr
     534                 :        536 :                 byte === 0x09 || //tab
     535            [ + ]:        536 :                 (byte >= 0x20 && byte <= 0x2F) || //common punctuation
     536            [ + ]:        536 :                 (byte >= 0x30 && byte <= 0x39) || //digits
     537            [ + ]:        536 :                 (byte >= 0x3A && byte <= 0x40) || //common punctuation
     538            [ + ]:        536 :                 (byte >= 0x41 && byte <= 0x5A) || //capital letters
     539            [ + ]:        536 :                 (byte >= 0x5B && byte <= 0x60) || //common punctuation
     540            [ + ]:        536 :                 (byte >= 0x61 && byte <= 0x7A) || //lowercase letters
     541                 :        536 :                 (byte >= 0x7B && byte <= 0x7E)    //common punctuation
     542                 :        536 :         );
     543                 :        536 : }

Generated by: LCOV version 1.14