bigbiff bigbiff | e60683a | 2013-02-22 20:55:50 -0500 | [diff] [blame] | 1 | |
| 2 | /* |
| 3 | * encode.c - string conversion routines (mostly for compatibility with |
| 4 | * udev/volume_id) |
| 5 | * |
| 6 | * Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org> |
| 7 | * Copyright (C) 2009 Karel Zak <kzak@redhat.com> |
| 8 | * |
| 9 | * This file may be redistributed under the terms of the |
| 10 | * GNU Lesser General Public License. |
| 11 | */ |
| 12 | #include <stdio.h> |
| 13 | #include <stdlib.h> |
| 14 | #include <stddef.h> |
| 15 | #include <unistd.h> |
| 16 | #include <errno.h> |
| 17 | #include <string.h> |
| 18 | #include <ctype.h> |
| 19 | |
| 20 | #include "blkidP.h" |
| 21 | |
| 22 | #define UDEV_ALLOWED_CHARS_INPUT "/ $%?," |
| 23 | |
| 24 | /** |
| 25 | * SECTION: encode |
| 26 | * @title: Encoding utils |
| 27 | * @short_description: encode strings to safe udev-compatible formats |
| 28 | * |
| 29 | */ |
| 30 | |
/*
 * Number of bytes the UTF-8 sequence starting at @str claims to occupy,
 * judged from its lead byte alone.
 *
 * Returns 1..6, or 0 when the byte cannot start a sequence (a continuation
 * byte 10xxxxxx, or 0xfe / 0xff).
 */
static int utf8_encoded_expected_len(const char *str)
{
	/* lead-byte prefixes for 2..6 byte sequences, in increasing length */
	static const struct {
		unsigned char mask;
		unsigned char tag;
	} prefix[] = {
		{ 0xe0, 0xc0 },
		{ 0xf0, 0xe0 },
		{ 0xf8, 0xf0 },
		{ 0xfc, 0xf8 },
		{ 0xfe, 0xfc },
	};
	const unsigned char lead = (unsigned char)str[0];
	int k;

	/* plain 7-bit ASCII */
	if (lead < 0x80)
		return 1;

	for (k = 0; k < (int)(sizeof(prefix) / sizeof(prefix[0])); k++) {
		if ((lead & prefix[k].mask) == prefix[k].tag)
			return k + 2;
	}
	return 0;
}
| 50 | |
/*
 * Decode the single UTF-8 sequence starting at @str into its code point.
 *
 * Returns the decoded value, or -1 when the lead byte is not a valid
 * sequence start or one of the following bytes is not a continuation byte
 * (10xxxxxx). No overlong-form or range checking is done here.
 */
static int utf8_encoded_to_unichar(const char *str)
{
	/* payload bits held by the lead byte, indexed by sequence length */
	static const int lead_payload[] = { 0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
	const int nbytes = utf8_encoded_expected_len(str);
	int value;
	int k;

	if (nbytes == 1)
		return (int)str[0];
	if (nbytes < 2 || nbytes > 6)
		return -1;

	value = (int)str[0] & lead_payload[nbytes];

	/* every continuation byte contributes six more bits */
	for (k = 1; k < nbytes; k++) {
		const int byte = (int)str[k];

		if ((byte & 0xc0) != 0x80)
			return -1;
		value = (value << 6) | (byte & 0x3f);
	}

	return value;
}
| 90 | |
/*
 * Number of bytes (1..6) needed to encode the code point @unichar in
 * UTF-8. Values below 0x80, including negative ones, yield 1.
 */
static int utf8_unichar_to_encoded_len(int unichar)
{
	/* first value that no longer fits in 1, 2, 3, 4 and 5 bytes */
	static const int too_big_for[] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	int n = 0;

	while (n < 5 && unichar >= too_big_for[n])
		n++;

	return n + 1;
}
| 106 | |
/*
 * Tell whether @unichar lies in an acceptable numeric range.
 *
 * Returns 0 for values above 0x10ffff, for the surrogate block
 * 0xd800..0xdfff, for 0xfdd0..0xfdef and for any value whose low 16 bits
 * are 0xffff; returns 1 otherwise.
 */
static int utf8_unichar_valid_range(int unichar)
{
	const int above_max   = unichar > 0x10ffff;
	const int surrogate   = (unichar & 0xfffff800) == 0xd800;
	const int fdd0_block  = unichar >= 0xfdd0 && unichar <= 0xfdef;
	const int ends_in_ffff = (unichar & 0xffff) == 0xffff;

	return !(above_max || surrogate || fdd0_block || ends_in_ffff);
}
| 120 | |
/*
 * Validate the single UTF-8 encoded character starting at @str.
 *
 * Returns its length in bytes (1 for ASCII, otherwise 2..6), or -1 when
 * the lead byte is invalid, the sequence is cut short, the value is not
 * encoded in its shortest form, or the value is outside the accepted range.
 */
static int utf8_encoded_valid_unichar(const char *str)
{
	const int nbytes = utf8_encoded_expected_len(str);
	int codepoint;
	int k;

	switch (nbytes) {
	case 0:
		/* not a valid lead byte */
		return -1;
	case 1:
		/* ASCII is always fine */
		return 1;
	default:
		break;
	}

	/*
	 * All bytes of a multi-byte sequence have the top bit set; this also
	 * stops at a terminating NUL before reading past the string.
	 */
	for (k = 0; k < nbytes; k++) {
		if (!(str[k] & 0x80))
			return -1;
	}

	codepoint = utf8_encoded_to_unichar(str);

	/* the value must need exactly as many bytes as were used */
	if (utf8_unichar_to_encoded_len(codepoint) != nbytes)
		return -1;

	/* and it must lie in an accepted numeric range */
	if (!utf8_unichar_valid_range(codepoint))
		return -1;

	return nbytes;
}
| 153 | |
/*
 * Copy @str to @to with whitespace normalised: leading and trailing
 * whitespace is dropped and every inner run of whitespace becomes a
 * single '_'.
 *
 * @str: NUL-terminated input; at most @len bytes of it are examined
 * @to:  output buffer; receives at most @len bytes plus a terminating NUL,
 *       so it must provide room for @len + 1 bytes
 * @len: maximum number of input bytes to consider
 *
 * Always returns 0.
 */
static int replace_whitespace(const char *str, char *to, size_t len)
{
	/*
	 * <ctype.h> functions require a value representable as unsigned char;
	 * passing a negative plain char (e.g. a UTF-8 byte) is undefined.
	 */
	const unsigned char *in = (const unsigned char *)str;
	const char *nul = memchr(str, '\0', len);
	size_t i = 0;
	size_t j = 0;

	/* never look past the terminator, nor past @len bytes */
	if (nul != NULL)
		len = (size_t)(nul - str);

	/* strip trailing whitespace */
	while (len && isspace(in[len - 1]))
		len--;

	/* strip leading whitespace; check the bound before reading */
	while (i < len && isspace(in[i]))
		i++;

	while (i < len) {
		/* substitute multiple whitespace with a single '_' */
		if (isspace(in[i])) {
			/* in[len - 1] is not whitespace, so this stays in bounds */
			while (i < len && isspace(in[i]))
				i++;
			to[j++] = '_';
		}
		to[j++] = str[i++];
	}
	to[j] = '\0';
	return 0;
}
| 181 | |
/*
 * Tell whether @c may be kept unmodified.
 *
 * @c:     character to test
 * @white: optional NUL-terminated set of additional allowed characters,
 *         may be NULL
 *
 * Returns 1 for ASCII letters and digits, for any of "#+-.:=@_" and for
 * any character listed in @white; returns 0 otherwise.
 */
static int is_whitelisted(char c, const char *white)
{
	/*
	 * strchr() treats the terminating NUL as part of the string, so
	 * without this guard '\0' would be reported as whitelisted.
	 */
	if (c == '\0')
		return 0;

	if ((c >= '0' && c <= '9') ||
	    (c >= 'A' && c <= 'Z') ||
	    (c >= 'a' && c <= 'z') ||
	    strchr("#+-.:=@_", c) != NULL ||
	    (white != NULL && strchr(white, c) != NULL))
		return 1;
	return 0;
}
| 192 | |
/*
 * Sanitise @str in place: whitelisted characters, "\x" hex-escape
 * introducers and valid multi-byte UTF-8 sequences are kept; every other
 * byte is overwritten.
 *
 * @str:   NUL-terminated string to modify
 * @white: optional set of extra allowed characters, may be NULL
 *
 * When @white contains a space, any whitespace character that is not itself
 * whitelisted is turned into an ordinary ' '; all other rejected bytes
 * become '_'.
 *
 * Returns the number of bytes that were replaced.
 */
static int replace_chars(char *str, const char *white)
{
	/* loop-invariant: is a plain space among the allowed characters? */
	const int space_allowed = white != NULL && strchr(white, ' ') != NULL;
	size_t i = 0;
	int replaced = 0;

	while (str[i] != '\0') {
		int len;

		if (is_whitelisted(str[i], white)) {
			i++;
			continue;
		}

		/* accept hex encoding; str[i+1] is at worst the terminator */
		if (str[i] == '\\' && str[i+1] == 'x') {
			i += 2;
			continue;
		}

		/* accept valid utf8 */
		len = utf8_encoded_valid_unichar(&str[i]);
		if (len > 1) {
			i += len;
			continue;
		}

		/*
		 * If space is allowed, replace whitespace with ordinary space.
		 * The cast is required: isspace() on a negative plain char is
		 * undefined behaviour.
		 */
		if (space_allowed && isspace((unsigned char)str[i])) {
			str[i] = ' ';
			i++;
			replaced++;
			continue;
		}

		/* everything else is replaced with '_' */
		str[i] = '_';
		i++;
		replaced++;
	}
	return replaced;
}
| 235 | |
| 236 | size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len, |
| 237 | const unsigned char *src, size_t count) |
| 238 | { |
| 239 | size_t i, j; |
| 240 | uint16_t c; |
| 241 | |
| 242 | for (j = i = 0; i + 2 <= count; i += 2) { |
| 243 | if (enc == BLKID_ENC_UTF16LE) |
| 244 | c = (src[i+1] << 8) | src[i]; |
| 245 | else /* BLKID_ENC_UTF16BE */ |
| 246 | c = (src[i] << 8) | src[i+1]; |
| 247 | if (c == 0) { |
| 248 | dest[j] = '\0'; |
| 249 | break; |
| 250 | } else if (c < 0x80) { |
| 251 | if (j+1 >= len) |
| 252 | break; |
| 253 | dest[j++] = (uint8_t) c; |
| 254 | } else if (c < 0x800) { |
| 255 | if (j+2 >= len) |
| 256 | break; |
| 257 | dest[j++] = (uint8_t) (0xc0 | (c >> 6)); |
| 258 | dest[j++] = (uint8_t) (0x80 | (c & 0x3f)); |
| 259 | } else { |
| 260 | if (j+3 >= len) |
| 261 | break; |
| 262 | dest[j++] = (uint8_t) (0xe0 | (c >> 12)); |
| 263 | dest[j++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f)); |
| 264 | dest[j++] = (uint8_t) (0x80 | (c & 0x3f)); |
| 265 | } |
| 266 | } |
| 267 | dest[j] = '\0'; |
| 268 | return j; |
| 269 | } |
| 270 | |
/**
 * blkid_encode_string:
 * @str: input string to be encoded
 * @str_enc: output string to store the encoded input string
 * @len: maximum size of the output string, which may be
 *       four times as long as the input string
 *
 * Encode all potentially unsafe characters of a string to the
 * corresponding hex value prefixed by '\x'. Valid multi-byte UTF-8
 * sequences are copied unchanged; a backslash is always encoded.
 *
 * The conversion fails as soon as fewer than four bytes remain free in
 * @str_enc after storing a character. On failure the content of @str_enc
 * is unspecified and not necessarily NUL-terminated.
 *
 * Returns: 0 if the entire string was copied, non-zero otherwise.
 **/
int blkid_encode_string(const char *str, char *str_enc, size_t len)
{
	static const char hexdigit[] = "0123456789abcdef";
	size_t i, j;

	if (!str || !str_enc || !len)
		return -1;

	for (i = 0, j = 0; str[i] != '\0'; i++) {
		int seqlen;

		seqlen = utf8_encoded_valid_unichar(&str[i]);
		if (seqlen > 1) {
			/* valid multi-byte UTF-8: copy the sequence verbatim */
			if (len-j < (size_t)seqlen)
				return -1;
			memcpy(&str_enc[j], &str[i], seqlen);
			j += seqlen;
			i += (seqlen-1);
		} else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
			const unsigned char byte = (unsigned char) str[i];

			if (len-j < 4)
				return -1;
			/*
			 * Emit "\xHH" byte by byte. sprintf() would append a
			 * NUL as a fifth byte and overrun the buffer when
			 * exactly four bytes are left.
			 */
			str_enc[j++] = '\\';
			str_enc[j++] = 'x';
			str_enc[j++] = hexdigit[byte >> 4];
			str_enc[j++] = hexdigit[byte & 0x0f];
		} else {
			if (len-j < 1)
				return -1;
			str_enc[j] = str[i];
			j++;
		}
		/* keep enough headroom for a further escape sequence */
		if (j+3 >= len)
			return -1;
	}
	if (len-j < 1)
		return -1;
	str_enc[j] = '\0';
	return 0;
}
| 321 | |
| 322 | /** |
| 323 | * blkid_safe_string: |
| 324 | * @str: input string |
| 325 | * @str_safe: output string |
| 326 | * @len: size of output string |
| 327 | * |
| 328 | * Allows plain ascii, hex-escaping and valid utf8. Replaces all whitespaces |
| 329 | * with '_'. |
| 330 | * |
| 331 | * Returns: 0 on success or -1 in case of error. |
| 332 | */ |
| 333 | int blkid_safe_string(const char *str, char *str_safe, size_t len) |
| 334 | { |
| 335 | if (!str || !str_safe || !len) |
| 336 | return -1; |
| 337 | replace_whitespace(str, str_safe, len); |
| 338 | replace_chars(str_safe, UDEV_ALLOWED_CHARS_INPUT); |
| 339 | return 0; |
| 340 | } |