utf8.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  /**
   *  Using strings in Ethereum (or any security-based system) requires
   *  additional care. These utilities attempt to mitigate some of the
   *  safety issues as well as provide the ability to recover and analyse
   *  strings.
   *
   *  @_subsection api/utils:Strings and UTF-8 [about-strings]
   */
  9. import { getBytes } from "./data.js";
  10. import { assertArgument, assertNormalize } from "./errors.js";
  11. import type { BytesLike } from "./index.js";
  12. ///////////////////////////////
  /**
   *  The standard Unicode normalization forms.
   */
  export type UnicodeNormalizationForm =
      | "NFC"
      | "NFD"
      | "NFKC"
      | "NFKD";
  /**
   *  When using the UTF-8 error API the following errors can be intercepted
   *  and processed as the %%reason%% passed to the [[Utf8ErrorFunc]].
   *
   *  **``"UNEXPECTED_CONTINUE"``** - a continuation byte was found where
   *  the first byte of a code point was expected.
   *
   *  **``"BAD_PREFIX"``** - a byte that is neither a valid leading byte
   *  nor a continuation byte was found at the start of a code point.
   *
   *  **``"OVERRUN"``** - the data ends before the number of bytes
   *  announced by the leading byte could be read.
   *
   *  **``"MISSING_CONTINUE"``** - a continuation byte was expected but
   *  not found. The %%offset%% is the index at which the continuation
   *  byte was expected.
   *
   *  **``"OUT_OF_RANGE"``** - the computed code point is larger than
   *  0x10ffff. The %%badCodepoint%% is the computed code point.
   *
   *  **``"UTF16_SURROGATE"``** - the computed code point lies inside the
   *  UTF-16 surrogate range (0xd800 to 0xdfff). The %%badCodepoint%% is
   *  the computed code point.
   *
   *  **``"OVERLONG"``** - the code point was encoded using more bytes
   *  than necessary. The %%badCodepoint%% is the computed code point,
   *  which has already passed the range and surrogate checks.
   */
  export type Utf8ErrorReason =
      | "UNEXPECTED_CONTINUE"
      | "BAD_PREFIX"
      | "OVERRUN"
      | "MISSING_CONTINUE"
      | "OUT_OF_RANGE"
      | "UTF16_SURROGATE"
      | "OVERLONG";
  /**
   *  A callback that can be used with [[toUtf8String]] to analyse or
   *  recover from invalid UTF-8 data.
   *
   *  UTF-8 data is parsed by a simple Finite-State Machine (FSM), which
   *  calls the ``Utf8ErrorFunc`` whenever a fault is detected.
   *
   *  The %%reason%% identifies which check of the FSM failed and the
   *  %%offset%% is the index into the input the fault is reported at.
   *
   *  The %%bytes%% is the raw UTF-8 data being parsed and %%output%% is
   *  the array of code points decoded so far, which the
   *  ``Utf8ErrorFunc`` may modify (e.g. to insert a replacement).
   *
   *  The value of the %%badCodepoint%% depends on the %%reason%%. See
   *  [[Utf8ErrorReason]] for details.
   *
   *  The return value is the number of bytes the FSM skips before it
   *  resumes parsing.
   */
  export type Utf8ErrorFunc = (
      reason: Utf8ErrorReason,
      offset: number,
      bytes: Uint8Array,
      output: Array<number>,
      badCodepoint?: number
  ) => number;
  /**
   *  The strict strategy: every reported fault fails the argument
   *  assertion on %%bytes%%, with a message naming the %%offset%% and
   *  the %%reason%%. The %%output%% and %%badCodepoint%% are not used.
   */
  function errorFunc(reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number): number {
      const message = `invalid codepoint at offset ${ offset }; ${ reason }`;
      assertArgument(false, message, "bytes", bytes);
  }
  /**
   *  The lenient strategy: nothing is added to %%output%%; the return
   *  value only tells the parser how many additional bytes to skip.
   *
   *  - for a bad prefix or a stray continuation byte, the run of
   *    continuation bytes directly following %%offset%% is skipped
   *  - for an overrun, everything up to the end of %%bytes%% is skipped
   *  - for any other %%reason%%, nothing is skipped
   */
  function ignoreFunc(reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number): number {
      switch (reason) {
          case "BAD_PREFIX":
          case "UNEXPECTED_CONTINUE": {
              // Walk over every continuation byte (10xx xxxx) after the bad byte
              const first = offset + 1;
              let cursor = first;
              while (cursor < bytes.length && (bytes[cursor] & 0xc0) === 0x80) {
                  cursor++;
              }
              return cursor - first;
          }

          case "OVERRUN":
              // Jump to the end of the data; the byte at offset itself
              // has already been consumed by the parser
              return bytes.length - offset - 1;

          default:
              // Nothing to skip
              return 0;
      }
  }
  /**
   *  The replacement strategy: an overlong encoding is accepted by
   *  appending its %%badCodepoint%% to %%output%%; every other fault
   *  appends the replacement character (0xfffd) and then skips bytes
   *  exactly as ignoreFunc does.
   */
  function replaceFunc(reason: Utf8ErrorReason, offset: number, bytes: Uint8Array, output: Array<number>, badCodepoint?: number): number {
      if (reason !== "OVERLONG") {
          // Stand in for the invalid sequence, then skip like "ignore"
          output.push(0xfffd);
          return ignoreFunc(reason, offset, bytes, output, badCodepoint);
      }

      // An overlong sequence still decodes to a usable code point; it is
      // merely not the shortest encoding, so keep the decoded value
      assertArgument(typeof(badCodepoint) === "number", "invalid bad code point for replacement", "badCodepoint", badCodepoint);
      output.push(badCodepoint);
      return 0;
  }
  /**
   *  The built-in UTF-8 error handling strategies, as a frozen record
   *  of [[Utf8ErrorFunc]] keyed by name.
   *
   *  **``"error"``** - fails on ANY illegal UTF-8 sequence or
   *  non-canonical (overlong) code point (this is the default)
   *
   *  **``"ignore"``** - silently drops any illegal UTF-8 sequence,
   *  including non-canonical (overlong) code points
   *
   *  **``"replace"``** - replaces any illegal UTF-8 sequence with the
   *  replacement character (U+FFFD) and accepts non-canonical
   *  (overlong) code points
   */
  export const Utf8ErrorFuncs: Readonly<Record<"error" | "ignore" | "replace", Utf8ErrorFunc>> =
      Object.freeze({ error: errorFunc, ignore: ignoreFunc, replace: replaceFunc });
  // http://stackoverflow.com/questions/13356493/decode-utf-8-with-javascript#13691499
  /**
   *  Decodes the UTF-8 data %%_bytes%% into an array of code points.
   *
   *  Every fault is reported to %%onError%% (default: the "error"
   *  strategy of Utf8ErrorFuncs); the value it returns is added to the
   *  read position before decoding resumes, and anything it appends to
   *  the result array is kept.
   */
  function getUtf8CodePoints(_bytes: BytesLike, onError?: Utf8ErrorFunc): Array<number> {
      const handleError: Utf8ErrorFunc = (onError == null) ? Utf8ErrorFuncs.error : onError;

      const bytes = getBytes(_bytes, "bytes");
      const codePoints: Array<number> = [];

      let pos = 0;
      while (pos < bytes.length) {
          // Index of the leading byte of the sequence being decoded
          const start = pos;
          const lead = bytes[pos++];

          // 0xxx xxxx
          if (lead < 0x80) {
              codePoints.push(lead);
              continue;
          }

          // Multibyte; the leading byte announces how many continuation
          // bytes follow. shorterMax is the largest code point that a
          // shorter encoding could already represent.
          let extra: number;
          let shorterMax: number;

          if ((lead & 0xe0) === 0xc0) {
              // 110x xxxx 10xx xxxx
              extra = 1;
              shorterMax = 0x7f;

          } else if ((lead & 0xf0) === 0xe0) {
              // 1110 xxxx 10xx xxxx 10xx xxxx
              extra = 2;
              shorterMax = 0x7ff;

          } else if ((lead & 0xf8) === 0xf0) {
              // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
              extra = 3;
              shorterMax = 0xffff;

          } else {
              // Not a leading byte: either a stray 10xx xxxx continuation
              // byte or a prefix no sequence may start with
              const reason = ((lead & 0xc0) === 0x80) ? "UNEXPECTED_CONTINUE" : "BAD_PREFIX";
              pos += handleError(reason, start, bytes, codePoints);
              continue;
          }

          // The last announced continuation byte must lie within the data
          if (start + extra >= bytes.length) {
              pos += handleError("OVERRUN", start, bytes, codePoints);
              continue;
          }

          // Keep only the payload bits of the leading byte
          let value = lead & ((1 << (7 - extra)) - 1);

          let intact = true;
          for (let remaining = extra; remaining > 0; remaining--) {
              const next = bytes[pos];

              // Anything other than 10xx xxxx ends the sequence early; the
              // offending byte is not consumed, so it is parsed again as
              // the start of the next sequence
              if ((next & 0xc0) !== 0x80) {
                  pos += handleError("MISSING_CONTINUE", pos, bytes, codePoints);
                  intact = false;
                  break;
              }

              value = (value << 6) | (next & 0x3f);
              pos++;
          }

          if (!intact) { continue; }

          // Checked in order: maximum code point, UTF-16 surrogate halves,
          // then overlong sequences (more bytes than needed)
          const fault =
              (value > 0x10ffff) ? "OUT_OF_RANGE" :
              (value >= 0xd800 && value <= 0xdfff) ? "UTF16_SURROGATE" :
              (value <= shorterMax) ? "OVERLONG" :
              null;

          if (fault != null) {
              pos += handleError(fault, start, bytes, codePoints, value);
              continue;
          }

          codePoints.push(value);
      }

      return codePoints;
  }
  // http://stackoverflow.com/questions/18729405/how-to-convert-utf8-string-to-byte-array
  /**
   *  Returns the UTF-8 byte representation of %%str%%.
   *
   *  If %%form%% is specified, it is validated with assertNormalize and
   *  the string is normalized before it is encoded.
   *
   *  The argument assertion fails if %%str%% is not a string, or if it
   *  contains an unpaired UTF-16 surrogate (a high surrogate that is not
   *  followed by a low surrogate, or a low surrogate on its own), since
   *  neither has a valid UTF-8 encoding.
   */
  export function toUtf8Bytes(str: string, form?: UnicodeNormalizationForm): Uint8Array {
      assertArgument(typeof(str) === "string", "invalid string value", "str", str);

      if (form != null) {
          assertNormalize(form);
          str = str.normalize(form);
      }

      const result: Array<number> = [];
      for (let i = 0; i < str.length; i++) {
          const c = str.charCodeAt(i);

          if (c < 0x80) {
              // 1 byte: 0xxx xxxx
              result.push(c);

          } else if (c < 0x800) {
              // 2 bytes: 110x xxxx 10xx xxxx
              result.push((c >> 6) | 0xc0);
              result.push((c & 0x3f) | 0x80);

          } else if ((c & 0xf800) === 0xd800) {
              // Any surrogate code unit (0xd800 to 0xdfff). Only a high
              // surrogate immediately followed by a low surrogate can be
              // encoded; encoding a lone surrogate as 3 bytes would emit
              // data that is not valid UTF-8.
              i++;
              const c2 = str.charCodeAt(i);

              assertArgument(
                  (c & 0xfc00) === 0xd800 && i < str.length && (c2 & 0xfc00) === 0xdc00,
                  "invalid surrogate pair", "str", str);

              // Surrogate Pair
              // 4 bytes: 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
              const pair = 0x10000 + ((c & 0x03ff) << 10) + (c2 & 0x03ff);
              result.push((pair >> 18) | 0xf0);
              result.push(((pair >> 12) & 0x3f) | 0x80);
              result.push(((pair >> 6) & 0x3f) | 0x80);
              result.push((pair & 0x3f) | 0x80);

          } else {
              // 3 bytes: 1110 xxxx 10xx xxxx 10xx xxxx
              result.push((c >> 12) | 0xe0);
              result.push(((c >> 6) & 0x3f) | 0x80);
              result.push((c & 0x3f) | 0x80);
          }
      }

      return new Uint8Array(result);
  }
  /**
   *  Builds a string from %%codePoints%%. Values up to 0xffff become a
   *  single UTF-16 code unit; any other value is written as a surrogate
   *  pair (high surrogate first).
   */
  function _toUtf8String(codePoints: Array<number>): string {
      const pieces: Array<string> = [];

      codePoints.forEach((codePoint) => {
          if (codePoint <= 0xffff) {
              pieces.push(String.fromCharCode(codePoint));
              return;
          }

          // Split the 20 bits above 0x10000 across the two surrogates
          const offset = codePoint - 0x10000;
          const high = 0xd800 + ((offset >> 10) & 0x3ff);
          const low = 0xdc00 + (offset & 0x3ff);
          pieces.push(String.fromCharCode(high, low));
      });

      return pieces.join("");
  }
  /**
   *  Returns the string represented by the UTF-8 data %%bytes%%.
   *
   *  When an %%onError%% function is specified, it is called on each
   *  UTF-8 error, allowing recovery using the [[Utf8ErrorFunc]] API.
   *  (default: [error](Utf8ErrorFuncs))
   */
  export function toUtf8String(bytes: BytesLike, onError?: Utf8ErrorFunc): string {
      const codePoints = getUtf8CodePoints(bytes, onError);
      return _toUtf8String(codePoints);
  }
  /**
   *  Returns the code points of %%str%%, obtained by encoding it as
   *  UTF-8 and decoding the result with the default (strict) error
   *  handling.
   *
   *  If %%form%% is specified, the string is normalized first.
   */
  export function toUtf8CodePoints(str: string, form?: UnicodeNormalizationForm): Array<number> {
      const encoded = toUtf8Bytes(str, form);
      return getUtf8CodePoints(encoded);
  }