Skip to content

Commit

Permalink
Validate UTF-8
Browse files Browse the repository at this point in the history
  • Loading branch information
flowerysong committed Nov 2, 2024
1 parent 0e24211 commit 59249b8
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 1 deletion.
12 changes: 11 additions & 1 deletion libopenarc/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1479,11 +1479,17 @@ arc_process_set(ARC_MESSAGE *msg,
set->set_data = hcopy;
set->set_bad = false;

if (!arc_check_utf8(hcopy))
{
arc_error(msg, "invalid UTF-8 in %s data", settype);
set->set_bad = true;
return ARC_STAT_SYNTAX;
}

for (p = hcopy; *p != '\0' && !stop; p++)
{
if (isascii(*p) && !isprint(*p) && !isspace(*p))
{
/* FIXME: should this do more validation of UTF-8? */
arc_error(
msg, "invalid character (ASCII 0x%02x at offset %d) in %s data",
*p, p - hcopy, settype);
Expand Down Expand Up @@ -2480,6 +2486,10 @@ arc_parse_header_field(ARC_MESSAGE *msg,
assert(hlen != 0);

/* enforce RFC 5322, Section 2.2 as extended by RFC 6532, Section 3.2 */
if (!arc_check_utf8(hdr))
{
return ARC_STAT_SYNTAX;
}
colon = NULL;
for (c = 0; c < hlen; c++)
{
Expand Down
91 changes: 91 additions & 0 deletions util/arc-dstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <limits.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -744,3 +745,93 @@ arc_lowercase(char *str)
}
}
}

/**
 *  Check whether a string is valid UTF-8
 *
 *  Every multi-byte sequence is decoded and rejected if it is truncated,
 *  has a malformed lead or continuation byte, is not the shortest possible
 *  encoding of its code point, or encodes a surrogate, a Unicode
 *  non-character, or a value above U+10FFFF.
 *
 *  Parameters:
 *      str: NUL-terminated string to check; must not be NULL
 *
 *  Returns:
 *      Whether the string passed the checks. An empty string passes.
 */

bool
arc_check_utf8(const char *str)
{
    size_t   charlen;
    uint32_t u;
    uint8_t  mask;

    for (const unsigned char *p = (const unsigned char *) str; *p != '\0'; p++)
    {
        /* Plain ASCII needs no further validation. */
        if (*p < 0x80)
        {
            continue;
        }

        /* The lead byte determines the sequence length and payload bits. */
        if ((*p & 0xe0) == 0xc0)
        {
            charlen = 2;
            mask = 0x1f;
        }
        else if ((*p & 0xf0) == 0xe0)
        {
            charlen = 3;
            mask = 0x0f;
        }
        else if ((*p & 0xf8) == 0xf0)
        {
            charlen = 4;
            mask = 0x07;
        }
        else
        {
            /* Anything else that has the high bit set is invalid. */
            return false;
        }

        u = *p & mask;
        for (size_t i = 1; i < charlen; i++)
        {
            p++;
            /* A terminating NUL fails this test too, so a truncated
             * sequence can never make us read past the end of the string.
             */
            if ((*p & 0xc0) != 0x80)
            {
                return false;
            }
            u <<= 6;
            u |= (*p & 0x3f);
        }

        /* Check that the codepoint used the shortest representation. */
        if ((u < 0x80) || ((u < 0x800) && (charlen > 2)) ||
            ((u < 0x10000) && (charlen > 3)))
        {
            return false;
        }

        /* Check for invalid codepoints. */

        /* A four byte sequence can carry values up to 0x1fffff, but
         * nothing above U+10FFFF is a valid codepoint.
         */
        if (u > 0x10ffff)
        {
            return false;
        }

        /* surrogates */
        if (u >= 0xd800 && u <= 0xdfff)
        {
            return false;
        }

        /* non-characters: U+FDD0..U+FDEF plus the last two codepoints of
         * every plane (U+nFFFE and U+nFFFF). The range check above bounds
         * the plane, so testing the low 16 bits is sufficient.
         */
        if ((u >= 0xfdd0 && u <= 0xfdef) || ((u & 0xfffe) == 0xfffe))
        {
            return false;
        }
    }

    return true;
}
1 change: 1 addition & 0 deletions util/arc-dstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,6 @@ extern void arc_clobber_array(char **);
extern void arc_collapse(char *);
extern char **arc_copy_array(char **);
extern void arc_lowercase(char *);
extern bool arc_check_utf8(const char *);

#endif /* ARC_DSTRING_H_ */

0 comments on commit 59249b8

Please sign in to comment.