/*
 * encode.c - string conversion routines (mostly for compatibility with
 *            udev/volume_id)
 *
 * Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org>
 * Copyright (C) 2009 Karel Zak <kzak@redhat.com>
 *
 * This file may be redistributed under the terms of the
 * GNU Lesser General Public License.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>

#include "blkidP.h"

/*
 * Extra characters that blkid_safe_string() asks replace_chars() to leave
 * untouched, on top of the set built into is_whitelisted().
 */
#define UDEV_ALLOWED_CHARS_INPUT "/ $%?,"

/**
 * SECTION: encode
 * @title: Encoding utils
 * @short_description: encode strings to safe udev-compatible formats
 *
 */

/*
 * Number of bytes the UTF-8 sequence starting at @str claims to occupy,
 * judged from its first byte only.
 *
 * Returns: 1..6 for a recognised lead byte, 0 for a byte that cannot start
 * a sequence (a continuation byte, 0xfe or 0xff).
 */
static int utf8_encoded_expected_len(const char *str)
{
	/* lead-byte mask/pattern pairs; entry n describes an (n + 1)-byte sequence */
	static const struct {
		unsigned char mask;
		unsigned char pattern;
	} lead[] = {
		{ 0x80, 0x00 },		/* 0xxxxxxx */
		{ 0xe0, 0xc0 },		/* 110xxxxx */
		{ 0xf0, 0xe0 },		/* 1110xxxx */
		{ 0xf8, 0xf0 },		/* 11110xxx */
		{ 0xfc, 0xf8 },		/* 111110xx */
		{ 0xfe, 0xfc },		/* 1111110x */
	};
	const unsigned char first = (unsigned char)str[0];
	size_t n;

	for (n = 0; n < sizeof(lead) / sizeof(lead[0]); n++) {
		if ((first & lead[n].mask) == lead[n].pattern)
			return (int)n + 1;
	}
	return 0;
}

/*
 * Decode the single UTF-8 sequence starting at @str.
 *
 * Only the structure is checked here (lead byte, continuation bytes);
 * overlong forms and the numeric range are not.
 *
 * Returns: the decoded value, or -1 if the lead byte is not recognised or
 * one of the following bytes is not a continuation byte.
 */
static int utf8_encoded_to_unichar(const char *str)
{
	/* payload bits carried by the lead byte, indexed by sequence length */
	static const int lead_payload[] = {
		0, 0, 0x1f, 0x0f, 0x07, 0x03, 0x01
	};
	const int seqlen = utf8_encoded_expected_len(str);
	int codepoint;
	int pos;

	if (seqlen == 1)
		return (int)str[0];
	if (seqlen < 2 || seqlen > 6)
		return -1;

	codepoint = (int)str[0] & lead_payload[seqlen];

	/* each continuation byte (10xxxxxx) contributes six more bits */
	for (pos = 1; pos < seqlen; pos++) {
		const int byte = (int)str[pos];

		if ((byte & 0xc0) != 0x80)
			return -1;
		codepoint = (codepoint << 6) | (byte & 0x3f);
	}

	return codepoint;
}

/*
 * Number of bytes needed to encode @unichar in UTF-8 (1..6).
 * Values below 0x80, including negative ones, report 1.
 */
static int utf8_unichar_to_encoded_len(int unichar)
{
	/* upper_bound[k] is the first value that no longer fits in k + 1 bytes */
	static const int upper_bound[] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	int bytes = 1;

	while (bytes < 6 && unichar >= upper_bound[bytes - 1])
		bytes++;

	return bytes;
}

/*
 * Check that @unichar lies in an acceptable numeric range.
 *
 * Rejected: anything above 0x10ffff, the surrogate block 0xd800..0xdfff,
 * 0xfdd0..0xfdef, and every value whose low 16 bits are 0xffff.
 *
 * Returns: 1 if acceptable, 0 otherwise.
 */
static int utf8_unichar_valid_range(int unichar)
{
	return unichar <= 0x10ffff &&
	       (unichar & 0xfffff800) != 0xd800 &&
	       !(unichar >= 0xfdd0 && unichar <= 0xfdef) &&
	       (unichar & 0xffff) != 0xffff;
}

/*
 * Validate the single UTF-8 sequence starting at @str.
 *
 * Returns: the length of the sequence in bytes (1 for plain ASCII), or -1
 * if it is malformed, encoded with more bytes than its value needs, or
 * decodes to a value rejected by utf8_unichar_valid_range().
 */
static int utf8_encoded_valid_unichar(const char *str)
{
	const int seqlen = utf8_encoded_expected_len(str);
	int codepoint;
	int pos;

	switch (seqlen) {
	case 0:			/* not a lead byte */
		return -1;
	case 1:			/* ascii is always valid */
		return 1;
	default:
		break;
	}

	/*
	 * Every byte of a multi-byte sequence has its top bit set.  A NUL
	 * fails this test, so the scan never runs past the end of the string.
	 */
	for (pos = 0; pos < seqlen; pos++) {
		if (!(str[pos] & 0x80))
			return -1;
	}

	codepoint = utf8_encoded_to_unichar(str);

	/* the value must need exactly as many bytes as were used for it */
	if (utf8_unichar_to_encoded_len(codepoint) != seqlen)
		return -1;

	if (!utf8_unichar_valid_range(codepoint))
		return -1;

	return seqlen;
}

/*
 * Copy @str to @to, dropping leading and trailing whitespace and collapsing
 * every inner run of whitespace into a single '_'.
 *
 * @str: input string; at most @len bytes of it are examined
 * @to:  output buffer of at least @len bytes
 * @len: bound on the input and size of the output buffer
 *
 * The result is always NUL-terminated within @len bytes; output that does
 * not fit is truncated.  A '_' is never emitted without the character that
 * follows it, so a truncated result does not end in a separator.
 *
 * Returns: 0 always.  Nothing is written when @len is zero.
 */
static int replace_whitespace(const char *str, char *to, size_t len)
{
	size_t i, j = 0;
	int gap = 0;		/* inner whitespace seen, '_' not emitted yet */

	if (len == 0)
		return 0;

	for (i = 0; i < len && str[i] != '\0'; i++) {
		size_t need;

		/* <ctype.h> wants an unsigned char value; plain char may be negative */
		if (isspace((unsigned char)str[i])) {
			/* whitespace before the first emitted character is dropped */
			gap = (j > 0);
			continue;
		}

		/* room for the pending '_', this character and the terminator */
		need = gap ? 2 : 1;
		if (len - j <= need)
			break;

		if (gap) {
			to[j++] = '_';
			gap = 0;
		}
		to[j++] = str[i];
	}

	/* a still-pending gap is trailing whitespace and is simply discarded */
	to[j] = '\0';
	return 0;
}

/*
 * Tell whether @c may be passed through unchanged.
 *
 * @c:     character to test
 * @white: optional string of additional allowed characters, may be NULL
 *
 * Allowed are ASCII letters and digits, the characters "#+-.:=@_" and
 * whatever @white lists.  The NUL character is never allowed.
 *
 * Returns: 1 if @c is allowed, 0 otherwise.
 */
static int is_whitelisted(char c, const char *white)
{
	/* strchr() also matches the terminating NUL of the set being searched */
	if (c == '\0')
		return 0;

	if ((c >= '0' && c <= '9') ||
	    (c >= 'A' && c <= 'Z') ||
	    (c >= 'a' && c <= 'z'))
		return 1;

	if (strchr("#+-.:=@_", c) != NULL)
		return 1;

	return white != NULL && strchr(white, c) != NULL;
}

/*
 * Sanitize @str in place.
 *
 * @str:   NUL-terminated string to modify
 * @white: optional string of additional allowed characters, may be NULL
 *
 * Left untouched: characters accepted by is_whitelisted(), the two-character
 * prefix "\x" of a hex escape, and valid multi-byte UTF-8 sequences.  Any
 * other character is replaced: whitespace becomes an ordinary space if
 * @white contains ' ', everything else becomes '_'.
 *
 * Returns: number of characters that were replaced.
 */
static int replace_chars(char *str, const char *white)
{
	/* loop-invariant: does the caller accept an ordinary space? */
	const int space_ok = white != NULL && strchr(white, ' ') != NULL;
	size_t i = 0;
	int replaced = 0;

	while (str[i] != '\0') {
		int len;

		if (is_whitelisted(str[i], white)) {
			i++;
			continue;
		}

		/* accept hex encoding */
		if (str[i] == '\\' && str[i+1] == 'x') {
			i += 2;
			continue;
		}

		/* accept valid utf8 */
		len = utf8_encoded_valid_unichar(&str[i]);
		if (len > 1) {
			i += len;
			continue;
		}

		/*
		 * What is left is unsafe.  It may be a stray byte >= 0x80,
		 * which is negative as plain char, hence the cast before
		 * handing it to <ctype.h>.
		 */
		if (space_ok && isspace((unsigned char)str[i]))
			str[i] = ' ';
		else
			str[i] = '_';
		i++;
		replaced++;
	}
	return replaced;
}

236size_t blkid_encode_to_utf8(int enc, unsigned char *dest, size_t len,
237 const unsigned char *src, size_t count)
238{
239 size_t i, j;
240 uint16_t c;
241
242 for (j = i = 0; i + 2 <= count; i += 2) {
243 if (enc == BLKID_ENC_UTF16LE)
244 c = (src[i+1] << 8) | src[i];
245 else /* BLKID_ENC_UTF16BE */
246 c = (src[i] << 8) | src[i+1];
247 if (c == 0) {
248 dest[j] = '\0';
249 break;
250 } else if (c < 0x80) {
251 if (j+1 >= len)
252 break;
253 dest[j++] = (uint8_t) c;
254 } else if (c < 0x800) {
255 if (j+2 >= len)
256 break;
257 dest[j++] = (uint8_t) (0xc0 | (c >> 6));
258 dest[j++] = (uint8_t) (0x80 | (c & 0x3f));
259 } else {
260 if (j+3 >= len)
261 break;
262 dest[j++] = (uint8_t) (0xe0 | (c >> 12));
263 dest[j++] = (uint8_t) (0x80 | ((c >> 6) & 0x3f));
264 dest[j++] = (uint8_t) (0x80 | (c & 0x3f));
265 }
266 }
267 dest[j] = '\0';
268 return j;
269}
270
/**
 * blkid_encode_string:
 * @str: input string to be encoded
 * @str_enc: output string to store the encoded input string
 * @len: maximum size of the output string, which may be
 *       four times as long as the input string
 *
 * Encode all potentially unsafe characters of a string to the
 * corresponding hex value prefixed by '\x'.  Valid multi-byte UTF-8
 * sequences and whitelisted characters are copied unchanged; a backslash
 * is always encoded.
 *
 * The call fails unless more than three bytes of @str_enc remain free
 * after every encoded character.  On failure @str_enc holds the
 * NUL-terminated part that was encoded before space ran out.
 *
 * Returns: 0 if the entire string was copied, non-zero otherwise.
 **/
int blkid_encode_string(const char *str, char *str_enc, size_t len)
{
	size_t i, j;

	if (!str || !str_enc || !len)
		return -1;

	for (i = 0, j = 0; str[i] != '\0'; i++) {
		int seqlen = utf8_encoded_valid_unichar(&str[i]);
		size_t need;

		/* output bytes this input character turns into */
		if (seqlen > 1)
			need = (size_t)seqlen;
		else if (str[i] == '\\' || !is_whitelisted(str[i], NULL))
			need = 4;		/* "\xHH" */
		else
			need = 1;

		/*
		 * Check the free margin before writing rather than after, so
		 * nothing (including the formatter's terminating NUL) is ever
		 * stored beyond @len.  j < len holds here, so the partial
		 * result can always be terminated.
		 */
		if (j + need + 3 >= len) {
			str_enc[j] = '\0';
			return -1;
		}

		if (seqlen > 1) {
			memcpy(&str_enc[j], &str[i], need);
			i += need - 1;
		} else if (need == 4) {
			snprintf(&str_enc[j], len - j, "\\x%02x",
				 (unsigned int)(unsigned char) str[i]);
		} else {
			str_enc[j] = str[i];
		}
		j += need;
	}

	str_enc[j] = '\0';
	return 0;
}

322/**
323 * blkid_safe_string:
324 * @str: input string
325 * @str_safe: output string
326 * @len: size of output string
327 *
328 * Allows plain ascii, hex-escaping and valid utf8. Replaces all whitespaces
329 * with '_'.
330 *
331 * Returns: 0 on success or -1 in case of error.
332 */
333int blkid_safe_string(const char *str, char *str_safe, size_t len)
334{
335 if (!str || !str_safe || !len)
336 return -1;
337 replace_whitespace(str, str_safe, len);
338 replace_chars(str_safe, UDEV_ALLOWED_CHARS_INPUT);
339 return 0;
340}