ModErn Text Analysis
META Enumerates Textual Applications
detail.h
Go to the documentation of this file.
1 
9 #ifndef META_UTF_DETAIL_H_
10 #define META_UTF_DETAIL_H_
11 
12 #include <array>
13 #include <stdexcept>
14 
15 #include <unicode/uclean.h>
16 #include <unicode/unistr.h>
17 
18 namespace meta
19 {
20 namespace utf
21 {
22 
28 {
29  private:
30  icu_handle()
31  {
32  auto status = U_ZERO_ERROR;
33  u_init(&status);
34  if (!U_SUCCESS(status))
35  throw std::runtime_error{"Failed to initialize icu"};
36  }
37 
38  public:
43  inline static icu_handle& get()
44  {
45  static icu_handle handle;
46  return handle;
47  }
48 
53  {
54  u_cleanup();
55  }
56 };
57 
64 inline std::u16string icu_to_u16str(const icu::UnicodeString& icu_str)
65 {
66  std::u16string u16str;
67  u16str.resize(icu_str.length());
68  auto status = U_ZERO_ERROR;
69  // looks dangerous, actually isn't: UChar is guaranteed to be a 16-bit
70  // integer type, so all we're doing here is going between signed vs.
71  // unsigned
72  icu_str.extract(reinterpret_cast<UChar*>(&u16str[0]), u16str.length(),
73  status);
74  return u16str;
75 }
76 
83 inline std::string icu_to_u8str(const icu::UnicodeString& icu_str)
84 {
85  std::string u8str;
86  icu_str.toUTF8String(u8str);
87  return u8str;
88 }
89 
95 inline void utf8_append_codepoint(std::string& dest, uint32_t codepoint)
96 {
97  std::array<uint8_t, U8_MAX_LENGTH> buf;
98  int32_t len = 0;
99  UBool err = FALSE;
100  U8_APPEND(&buf[0], len, U8_MAX_LENGTH, codepoint, err);
101  if (err)
102  throw std::runtime_error{"failed to add codepoint to string"};
103  dest.append(reinterpret_cast<char*>(&buf[0]), len);
104 }
105 }
106 }
107 #endif
void utf8_append_codepoint(std::string &dest, uint32_t codepoint)
Helper method that appends a UTF-32 codepoint to the given utf8 string.
Definition: detail.h:95
~icu_handle()
Destructor.
Definition: detail.h:52
std::string icu_to_u8str(const icu::UnicodeString &icu_str)
Helper method that converts an ICU string to a std::string in utf8.
Definition: detail.h:83
The ModErn Text Analysis toolkit is a suite of natural language processing, classification, information retrieval, data mining, and other applications of text processing.
Definition: analyzer.h:24
std::u16string icu_to_u16str(const icu::UnicodeString &icu_str)
Helper method that converts an ICU string to a std::u16string.
Definition: detail.h:64
Internal class that ensures that ICU cleans up all of its "still-reachable" memory before program termination.
Definition: detail.h:27