Source: lib/helper/xml-encoding-sniffer.js

  1. /**
  2. * W3C Extensible Markup Language (XML) 1.0 (Fifth Edition)
  3. * W3C Recommendation 26 November 2008
  4. * url: https://www.w3.org/TR/xml/#charencoding
  5. *
  6. * function xmlEncodingSniffer(sample)
  7. * function getXmlDeclaredEncoding(sample, guessedEncoding)
  8. * function getStringFromByteArray(message, guessedEncoding)
  9. * function detectUnicodeInByteSampleByHeuristics(sampleBytes)
  10. * function detectSuspiciousUTF8SequenceLength(sampleBytes, currentPos)
  11. * function detectBOMBytes(bomBytes)
  12. * function isCommonUSASCIIByte(byte)
  13. *
  14. * xmlEncodingSniffer => detectBOMBytes, getXmlDeclaredEncoding
  15. * getXmlDeclaredEncoding => getStringFromByteArray
  16. * getStringFromByteArray => detectBOMBytes, detectUnicodeInByteSampleByHeuristics
  17. * detectUnicodeInByteSampleByHeuristics => detectSuspiciousUTF8SequenceLength, isCommonUSASCIIByte
  18. * detectSuspiciousUTF8SequenceLength => NULL
  19. * detectBOMBytes => NULL
  20. * isCommonUSASCIIByte(byte) => NULL
  21. *
  22. * @module helper-xml-encoding-sniffer
  23. * @desc Helper module - Helper functions for the main module {@link module:whatwg-xhr whatwg-xhr}.
  24. * @version 1.0.0
  25. * @author Essam A. El-Sherif
  26. */
  /**
   * @func xmlEncodingSniffer
   * @static
   * @param {object} sample Buffer data.
   * @return {string} Encoding detected.
   * @desc Auto-detection of character encoding of XML data.
   *
   * Byte-level evidence is collected first (a byte order mark, otherwise the byte
   * pattern of a leading "<?xml"), then the encoding pseudo-attribute of the XML
   * declaration is read. A declared encoding is returned when there is one; without
   * a declaration the byte-level result is returned; UTF-8 is the default only when
   * neither source says otherwise.
   */
  export function xmlEncodingSniffer(sample){
    // Byte-level evidence: a BOM is looked for first, then the bytes of "<?xml".
    let detectedEncoding = detectBOMBytes(sample);
    if(detectedEncoding === null){
      detectedEncoding = detectXmlDeclarationSignature(sample);
    }

    // Read the encoding pseudo-attribute of the XML declaration, if present. The
    // byte-level result (or UTF-8) is handed over as the encoding to read it with.
    const declaredEncoding = getXmlDeclaredEncoding(sample, detectedEncoding || 'UTF-8');

    if(declaredEncoding == null){
      // No declaration: trust the bytes. 'ASCII' is only a family guess (UTF-8,
      // ISO 646, ISO 8859, ...), so it falls back to the UTF-8 default like
      // "nothing detected" does.
      const isOnlyFamilyGuess = detectedEncoding === 'ASCII';
      return (detectedEncoding === null || isOnlyFamilyGuess) ? 'UTF-8' : detectedEncoding;
    }

    // A declared "UTF-16" / "UTF-32" does not carry the byte order; when the bytes
    // revealed it (e.g. 'UTF-16BE'), the more specific byte-level name is returned.
    const declaredName = declaredEncoding.toUpperCase();
    const declaresGenericUnicode = declaredName === 'UTF-16' || declaredName === 'UTF-32';
    if(declaresGenericUnicode && detectedEncoding !== null && detectedEncoding.startsWith(declaredName)){
      return detectedEncoding;
    }

    // In every other case (agreement or mismatch) the declaration wins.
    return declaredEncoding;
  }

  // Byte patterns of a leading "<?xml" (or "<" / "<?") in encodings without a BOM,
  // paired with the encoding, or encoding family, each pattern points to.
  const XML_DECLARATION_SIGNATURES = [
    { bytes: [0x00, 0x3C, 0x00, 0x3F], encoding: 'UTF-16BE' }, // 16-bit units, big-endian
    { bytes: [0x00, 0x00, 0x00, 0x3C], encoding: 'UTF-32BE' },
    { bytes: [0x3C, 0x00, 0x00, 0x00], encoding: 'UTF-32LE' },
    { bytes: [0x3C, 0x00, 0x3F, 0x00], encoding: 'UTF-16LE' }, // 16-bit units, little-endian
    { bytes: [0x3C, 0x3F, 0x78, 0x6D], encoding: 'ASCII' },    // UTF-8, ISO 646, ISO 8859, other 7/8-bit
    { bytes: [0x4C, 0x6F, 0xA7, 0x94], encoding: 'IBM037' },   // IBM EBCDIC US-Canada (CP037)
  ];

  // Returns the encoding whose "<?xml" byte signature starts the sample, or null.
  function detectXmlDeclarationSignature(sample){
    for(const signature of XML_DECLARATION_SIGNATURES){
      if(signature.bytes.every((byte, index) => sample[index] === byte)){
        return signature.encoding;
      }
    }
    return null;
  }
  // Matches the version and encoding pseudo-attributes of an XML declaration.
  // Whitespace is allowed around "=", any "1.x" version number is accepted, and the
  // encoding name may contain letters, digits, "_", "." and "-".
  const XML_DECLARATION_ENCODING_PATTERN =
    /<\?xml\s+version\s*=\s*["']1\.\d+["']\s+encoding\s*=\s*["'](?<encoding>[\w.\-]+)["']/;

  /**
   * @func getXmlDeclaredEncoding
   * @param {object} sample Buffer data.
   * @param {string} guessedEncoding Encoding passed on to getStringFromByteArray for decoding the sample.
   * @return {string|null} Encoding name exactly as written in the XML declaration, or null when none is found.
   * @desc Capture the encoding pseudo-attribute from the XML declaration.
   */
  function getXmlDeclaredEncoding(sample, guessedEncoding){
    const contents = getStringFromByteArray(sample, guessedEncoding);
    const match = XML_DECLARATION_ENCODING_PATTERN.exec(contents);
    return match ? match.groups.encoding : null;
  }
  // Number of leading bytes skipped for each encoding detectBOMBytes can report.
  const BOM_BYTE_LENGTHS = new Map([
    ['UTF-16LE', 2],
    ['UTF-16BE', 2],
    ['UTF-7', 3],
    ['UTF-8', 3],
    ['UTF-32LE', 4],
    ['UTF-32BE', 4],
  ]);

  /**
   * @func getStringFromByteArray
   * @param {object} message Byte sample (typed array).
   * @param {string} guessedEncoding Encoding used when neither a BOM nor the heuristics identify one.
   * @return {string} The sample decoded to text.
   * @desc Decode the sample with, in order of preference, the encoding of its BOM, the
   * encoding found by detectUnicodeInByteSampleByHeuristics, or guessedEncoding.
   */
  function getStringFromByteArray(message, guessedEncoding){
    const bomEncoding = detectBOMBytes(message);
    if(bomEncoding){
      const bomLength = BOM_BYTE_LENGTHS.get(bomEncoding) || 0;
      return decodeSampleBytes(message.subarray(bomLength), bomEncoding);
    }
    const encoding = detectUnicodeInByteSampleByHeuristics(message) || guessedEncoding;
    return decodeSampleBytes(new Uint8Array(message), encoding);
  }

  // Decodes bytes with TextDecoder where the label is supported. UTF-32 is decoded
  // by hand, and any other label TextDecoder rejects falls back to one character
  // per byte, so that ASCII-range text in the sample stays readable instead of the
  // call throwing.
  function decodeSampleBytes(bytes, encoding){
    const label = String(encoding).toUpperCase();
    if(label === 'UTF-32' || label === 'UTF-32LE' || label === 'UTF-32BE'){
      // A bare 'UTF-32' is treated as little-endian.
      return decodeUtf32Sample(bytes, label !== 'UTF-32BE');
    }
    let decoder;
    try{
      decoder = new TextDecoder(encoding);
    }
    catch(err){
      // TextDecoder signals an unsupported label with a RangeError.
      if(!(err instanceof RangeError)) throw err;
      return decodeBytesOneToOne(bytes);
    }
    return decoder.decode(bytes);
  }

  // Maps every byte to the character with the same code (ISO-8859-1 style).
  function decodeBytesOneToOne(bytes){
    let text = '';
    for(let index = 0; index < bytes.length; index++){
      text += String.fromCharCode(bytes[index]);
    }
    return text;
  }

  // Decodes 32-bit code units; values that are not Unicode scalar values become
  // U+FFFD and a trailing partial unit is ignored.
  function decodeUtf32Sample(bytes, littleEndian){
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    let text = '';
    for(let offset = 0; offset + 4 <= view.byteLength; offset += 4){
      const codePoint = view.getUint32(offset, littleEndian);
      const isScalarValue = codePoint <= 0x10FFFF && (codePoint < 0xD800 || codePoint > 0xDFFF);
      text += isScalarValue ? String.fromCodePoint(codePoint) : '\uFFFD';
    }
    return text;
  }
  /**
   * @func detectUnicodeInByteSampleByHeuristics
   * @param {object} sampleBytes Byte sample.
   * @return {string|null} 'UTF-16LE', 'UTF-16BE', 'UTF-8', or null when the sample gives no clear hint.
   * @desc Guess a Unicode encoding for a BOM-less sample from the distribution of zero
   * bytes (UTF-16) and from the presence of UTF-8 looking multi-byte sequences inside
   * otherwise US-ASCII text (UTF-8). All thresholds are arbitrary.
   */
  function detectUnicodeInByteSampleByHeuristics(sampleBytes){
    const sampleLength = sampleBytes.length;
    // An empty sample carries no evidence (and would turn every ratio below into NaN).
    if(!sampleLength) return null;

    let evenNullCount = 0;
    let oddNullCount = 0;
    let suspiciousSequenceCount = 0;
    let suspiciousByteCount = 0;
    let commonAsciiCount = 0;
    // Bytes still belonging to the suspicious sequence found last; they must not
    // start a new sequence themselves.
    let bytesToSkip = 0;

    for(let pos = 0; pos < sampleLength; pos++){
      const byte = sampleBytes[pos];

      // binary null distribution
      if(byte === 0){
        if(pos % 2 === 0){
          evenNullCount++;
        }
        else{
          oddNullCount++;
        }
      }

      // likely US-ASCII characters
      if(isCommonUSASCIIByte(byte)){
        commonAsciiCount++;
      }

      // suspicious sequences (look like UTF-8)
      if(bytesToSkip > 0){
        bytesToSkip--;
      }
      else{
        const sequenceLength = detectSuspiciousUTF8SequenceLength(sampleBytes, pos);
        if(sequenceLength > 0){
          suspiciousSequenceCount++;
          suspiciousByteCount += sequenceLength;
          bytesToSkip = sequenceLength - 1;
        }
      }
    }

    // Share of the even / odd positions that hold a zero byte.
    const evenNullRatio = (evenNullCount * 2.0) / sampleLength;
    const oddNullRatio = (oddNullCount * 2.0) / sampleLength;

    // UTF-16LE text in english / european environments: zero bytes dominate the odd
    // positions (counting from 0) and are rare at the even ones. The 20% / 60%
    // thresholds are arbitrary.
    if(evenNullRatio < 0.2 && oddNullRatio > 0.6){
      return 'UTF-16LE';
    }
    // UTF-16BE is the mirror image.
    if(oddNullRatio < 0.2 && evenNullRatio > 0.6){
      return 'UTF-16BE';
    }

    // UTF-8: being decodable as UTF-8 says little on its own (pure ASCII is valid
    // UTF-8 too), so two statistical conditions are required as well:
    //  - at least one suspicious sequence per 500,000 bytes (arbitrary), and
    //  - at least 80% (arbitrary) of the bytes outside those sequences are common
    //    US-ASCII, unless the sample consists of suspicious sequences only.
    const hasSuspiciousSequences = suspiciousSequenceCount * 500000.0 / sampleLength >= 1;
    const otherByteCount = sampleLength - suspiciousByteCount;
    const isMostlyAscii = otherByteCount === 0 || commonAsciiCount / otherByteCount >= 0.8;
    if(hasSuspiciousSequences && isMostlyAscii && isWellFormedUtf8(sampleBytes)){
      return 'UTF-8';
    }
    return null;
  }

  // True when the bytes decode as UTF-8 without error. The check runs on the raw
  // bytes (not on a string conversion of them), so overlong forms, surrogates and
  // truncated sequences are rejected.
  function isWellFormedUtf8(bytes){
    const view = ArrayBuffer.isView(bytes) ? bytes : Uint8Array.from(bytes);
    try{
      new TextDecoder('utf-8', { fatal: true }).decode(view);
      return true;
    }
    catch(err){
      // A fatal decoder reports malformed input with a TypeError.
      if(err instanceof TypeError) return false;
      throw err;
    }
  }
  // Second bytes accepted after the lead bytes 0xC2 and 0xC5, and third bytes
  // accepted after 0xE2 0x80.
  const SUSPICIOUS_AFTER_C2 = new Set([0x81, 0x8D, 0x8F, 0x90, 0x9D]);
  const SUSPICIOUS_AFTER_C5 = new Set([0x92, 0x93, 0xA0, 0xA1, 0xB8, 0xBD, 0xBE]);
  const SUSPICIOUS_AFTER_E2_80 = new Set([
    0x93, 0x94, 0x98, 0x99, 0x9A, 0x9C, 0x9D, 0x9E,
    0xA0, 0xA1, 0xA2, 0xA6, 0xB0, 0xB9, 0xBA,
  ]);

  /**
   * @func detectSuspiciousUTF8SequenceLength
   * @param {object} sampleBytes Byte sample.
   * @param {number} currentPos Index of the byte to inspect.
   * @return {number} 2 or 3 when one of the listed two- or three-byte sequences starts at currentPos, otherwise 0.
   * @desc Check whether the bytes at currentPos form one of a fixed list of byte
   * sequences that look like UTF-8 multi-byte characters. A sequence cut off by the
   * end of the sample is not counted.
   */
  function detectSuspiciousUTF8SequenceLength(sampleBytes, currentPos){
    const sampleLength = sampleBytes.length;
    // Bytes past the end of the sample are read as undefined, which matches no rule.
    const byteAt = (index) => (index < sampleLength ? sampleBytes[index] : undefined);

    const lead = byteAt(currentPos);
    const second = byteAt(currentPos + 1);

    switch(lead){
      case 0xC2:
        return (SUSPICIOUS_AFTER_C2.has(second) || (second >= 0xA0 && second <= 0xBF)) ? 2 : 0;
      case 0xC3:
        return (second >= 0x80 && second <= 0xBF) ? 2 : 0;
      case 0xC5:
        return SUSPICIOUS_AFTER_C5.has(second) ? 2 : 0;
      case 0xC6:
        return second === 0x92 ? 2 : 0;
      case 0xCB:
        return (second === 0x86 || second === 0x9C) ? 2 : 0;
      case 0xE2: {
        const third = byteAt(currentPos + 2);
        if(second === 0x80){
          return SUSPICIOUS_AFTER_E2_80.has(third) ? 3 : 0;
        }
        if((second === 0x82 && third === 0xAC) || (second === 0x84 && third === 0xA2)){
          return 3;
        }
        return 0;
      }
      default:
        return 0;
    }
  }
  /**
   * @func detectBOMBytes
   * @param {object} bomBytes Byte sample whose first bytes are inspected.
   * @return {string|null} 'UTF-16LE', 'UTF-16BE', 'UTF-8', 'UTF-7', 'UTF-32LE', 'UTF-32BE', or null when no byte order mark is recognised.
   * @desc Detect the encoding announced by a byte order mark at the start of the sample.
   */
  function detectBOMBytes(bomBytes){
    const byteCount = bomBytes.length;
    if(byteCount < 2) return null;

    // Bytes beyond the end of the sample read as undefined and never match.
    const startsWith = (...signature) => signature.every((byte, index) => bomBytes[index] === byte);

    if(startsWith(0xFF, 0xFE)){
      // FF FE opens both the UTF-16LE and the UTF-32LE mark; the longer
      // FF FE 00 00 form is UTF-32LE.
      const isUtf32 = byteCount >= 4 && bomBytes[2] === 0x00 && bomBytes[3] === 0x00;
      return isUtf32 ? 'UTF-32LE' : 'UTF-16LE';
    }
    // UTF-16BE - Unicode UTF-16 big endian byte order
    if(startsWith(0xFE, 0xFF)) return 'UTF-16BE';
    // UTF-8
    if(startsWith(0xEF, 0xBB, 0xBF)) return 'UTF-8';
    // Character encodings such as UTF-7 that make overloaded usage of ASCII-valued
    // bytes may fail to be reliably detected
    if(startsWith(0x2B, 0x2F, 0x76)) return 'UTF-7';
    // UTF-32BE - Unicode UTF-32 big endian byte order
    if(startsWith(0x00, 0x00, 0xFE, 0xFF)) return 'UTF-32BE';
    return null;
  }
  /**
   * @func isCommonUSASCIIByte
   * @param {number} byte Byte value to classify.
   * @return {boolean} true for tab, line feed, carriage return and the printable
   * US-ASCII range 0x20-0x7E (space, punctuation, digits, letters).
   * @desc Tell whether a byte is one commonly found in plain US-ASCII text.
   */
  function isCommonUSASCIIByte(byte){
    const isTabOrLineBreak = byte === 0x09 || byte === 0x0A || byte === 0x0D;
    // punctuation, digits, capital and lowercase letters form one contiguous range
    const isPrintable = byte >= 0x20 && byte <= 0x7E;
    return isTabOrLineBreak || isPrintable;
  }

A Node.js implementation of the WHATWG XMLHttpRequest Living Standard for non-browser environments.