This repository has been archived by the owner on Jul 6, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathclient_parser.c
127 lines (123 loc) · 3.09 KB
/
client_parser.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "client_parser.h"
/*
 * States of the resumable client-input parser.
 */
enum client_state {
	s_text = 0,		/* plain text, or start of a UTF-8 sequence */
	s_utf8_continuation,	/* inside a multi-byte UTF-8 sequence */
	s_iac,			/* TELNET IAC (\xff) */
	s_iac_3byte,		/* TELNET IAC + WILL/WONT/DO/DONT */
};

/*
 * Parser state.  It has static storage so that a UTF-8 sequence or a TELNET
 * command split across two calls is still decoded correctly.
 */
static enum client_state state = s_text;
/* State to return to once a TELNET command has been passed through. */
static enum client_state stored_state = s_text;
/* Continuation bytes still expected for the current UTF-8 sequence. */
static unsigned continuation_bytes_left = 0;
/* Code point being assembled from the current UTF-8 sequence. */
static uint32_t codepoint = 0;

/*
 * Convert UTF-8 to ISO-8859-1, but pass telnet commands as-is.
 *
 * Code points U+00A0..U+00FF are emitted as the single corresponding
 * ISO-8859-1 byte and ASCII is copied unchanged.  Every other decoded code
 * point, every invalid lead or continuation byte, and a TELNET-escaped 0xff
 * (IAC IAC) is replaced by '?'.  A byte that is rejected as an invalid
 * continuation byte is consumed together with the sequence it aborted.
 * Overlong 3- and 4-byte encodings of U+00A0..U+00FF are not rejected.
 *
 * TELNET commands (IAC + command, or IAC + WILL/WONT/DO/DONT + option) are
 * copied verbatim, even when they interrupt a UTF-8 sequence; decoding of the
 * interrupted sequence resumes afterwards.
 *
 * The parser state is kept between calls, so input may be fed in arbitrary
 * pieces.
 *
 * dst should be at least the same size as src (ie. len bytes long).  Note
 * that if the previous call ended right after an IAC byte, the first byte of
 * this call produces two output bytes, so the worst case is len + 1 bytes.
 *
 * Returns the number of bytes written to dst.
 */
size_t
client_utf8_to_iso8859_1(char *dst, const char *src, size_t len)
{
	char *origdst = dst;
	const char *p;

	for (p = src; p < src + len; p++) {
		unsigned char ch = (unsigned char)*p;

		/*
		 * TELNET IAC could occur at any time, even in the middle of a
		 * utf8 character's bytes (though that is maybe unlikely). So
		 * special-case it here and store current state.
		 */
		if (ch == 0xff && state != s_iac && state != s_iac_3byte) {
			stored_state = state;
			state = s_iac;
			continue;
		}

		switch (state) {
		case s_text:
			if (ch < 0x80) {
				*dst++ = (char)ch;
			} else if ((ch & 0xe0) == 0xc0) {
				/* first 3 bits: 110 */
				state = s_utf8_continuation;
				continuation_bytes_left = 1;
				/* 5 + 6 bits */
				codepoint = (uint32_t)(ch & 0x1f) << 6;
			} else if ((ch & 0xf0) == 0xe0) {
				/* first 4 bits: 1110 */
				state = s_utf8_continuation;
				continuation_bytes_left = 2;
				/* 4 + 6 + 6 bits */
				codepoint = (uint32_t)(ch & 0xf) << 12;
			} else if ((ch & 0xf8) == 0xf0) {
				/* first 5 bits: 11110 */
				state = s_utf8_continuation;
				continuation_bytes_left = 3;
				/* 3 + 6 + 6 + 6 bits */
				codepoint = (uint32_t)(ch & 0x7) << 18;
			} else {
				/* invalid byte */
				*dst++ = '?';
				state = s_text;
			}
			break;
		case s_utf8_continuation:
			if ((ch & 0xc0) != 0x80) {
				/* first two bits are not 10; invalid
				 * continuation byte. */
				*dst++ = '?';
				codepoint = 0;
				state = s_text;
				continue;
			}
			continuation_bytes_left--;
			/* 6 new bits, placed according to how many bytes
			 * are still to come */
			codepoint |= (uint32_t)(ch & 0x3f) <<
			    (continuation_bytes_left * 6);
			if (!continuation_bytes_left) {
				/* only the upper half of ISO-8859-1 is
				 * accepted from multi-byte sequences */
				if (codepoint >= 0xa0 && codepoint <= 0xff)
					*dst++ = (char)codepoint;
				else
					*dst++ = '?';
				codepoint = 0;
				state = s_text;
			}
			break;
		case s_iac:
			if (ch == 0xff) {
				/*
				 * TELNET-escaped 0xff byte - this makes no
				 * sense coming from the client as utf-8
				 * strings cannot contain 0xff (and the client
				 * does not send xterm control sequences or
				 * other binary to the server). Treat this as
				 * invalid byte.
				 */
				*dst++ = '?';
				state = stored_state;
				continue;
			}
			*dst++ = '\xff';
			*dst++ = (char)ch;
			/*
			 * IAC WILL (0xfb) / WONT (0xfc) / DO (0xfd) /
			 * DONT (0xfe) are followed by one option byte.
			 */
			if (ch == 0xfb || ch == 0xfc || ch == 0xfd ||
			    ch == 0xfe)
				state = s_iac_3byte;
			else
				state = stored_state;
			break;
		case s_iac_3byte:
			/* third byte of 3-byte command */
			*dst++ = (char)ch;
			state = stored_state;
			break;
		}
	}
	return (size_t)(dst - origdst);
}