LTP GCOV extension - code coverage report
Current view: directory - usr/include/xapian - unicode.h
Test: lcov.info
Date: 2008-08-14 Instrumented lines: 8
Code covered: 87.5 % Executed lines: 7

       1                 : /** @file unicode.h
       2                 :  * @brief Unicode and UTF-8 related classes and functions.
       3                 :  */
       4                 : /* Copyright (C) 2006,2007,2008 Olly Betts
       5                 :  *
       6                 :  * This program is free software; you can redistribute it and/or modify
       7                 :  * it under the terms of the GNU General Public License as published by
       8                 :  * the Free Software Foundation; either version 2 of the License, or
       9                 :  * (at your option) any later version.
      10                 :  *
      11                 :  * This program is distributed in the hope that it will be useful,
      12                 :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13                 :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14                 :  * GNU General Public License for more details.
      15                 :  *
      16                 :  * You should have received a copy of the GNU General Public License
      17                 :  * along with this program; if not, write to the Free Software
      18                 :  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
      19                 :  */
      20                 : 
      21                 : #ifndef XAPIAN_INCLUDED_UNICODE_H
      22                 : #define XAPIAN_INCLUDED_UNICODE_H
      23                 : 
      24                 : #include <xapian/visibility.h>
      25                 : 
      26                 : #include <string>
      27                 : 
      28                 : namespace Xapian {
      29                 : 
      30                 : /** An iterator which returns unicode character values from a UTF-8 encoded
      31                 :  *  string.  The string is referenced, not copied (see assign()).
      32                 :  */
      33                 : class XAPIAN_VISIBILITY_DEFAULT Utf8Iterator {
      34                 :     const unsigned char *p;  // Current position; NULL once the end has been reached.
      35                 :     const unsigned char *end;  // One past the last byte of the buffer being read.
      36                 :     mutable unsigned seqlen;  // Byte length of the sequence at p; 0 means not yet calculated.
      37                 : 
      38                 :     void calculate_sequence_length() const;
      39                 : 
      40                 :     unsigned get_char() const;
      41                 : 
      42                 :     Utf8Iterator(const unsigned char *p_, const unsigned char *end_, unsigned seqlen_)
      43                 :         : p(p_), end(end_), seqlen(seqlen_) { }  // Private: lets operator++(int) return the pre-increment state.
      44                 : 
      45                 :   public:
      46                 :     /** Return the raw const char * pointer for the current position (the end pointer once iteration has finished). */
      47                 :     const char * raw() const {
      48                 :         return reinterpret_cast<const char *>(p ? p : end);
      49                 :     }
      50                 : 
      51                 :     /** Return the number of bytes left in the iterator's buffer (0 once the end has been reached). */
      52                 :     size_t left() const { return p ? end - p : 0; }
      53                 : 
      54                 :     /** Assign a new string to the iterator.
      55                 :      *
      56                 :      *  The iterator will forget the string it was iterating through, and
      57                 :      *  return characters from the start of the new string when next called.
      58                 :      *  The string is not copied into the iterator, so it must remain valid
      59                 :      *  while the iteration is in progress.
      60                 :      *  NOTE(review): if len is 0 only p is reset; end and seqlen are untouched.
      61                 :      *  @param p_ A pointer to the start of the string to read.
      62                 :      *  @param len The length of the string to read.  If 0, the iterator is
      63                 :      *             put straight into the end state (p is set to NULL).
      64                 :      */
      65            5208 :     void assign(const char *p_, size_t len) {
      66            5208 :         if (len) {
      67            5208 :             p = reinterpret_cast<const unsigned char*>(p_);
      68            5208 :             end = p + len;
      69            5208 :             seqlen = 0;  // Force the sequence length to be recalculated for the new buffer.
      70                 :         } else {
      71               0 :             p = NULL;  // NOTE(review): end/seqlen are indeterminate here when reached from a constructor; raw() would return that end - TODO confirm intent.
      72                 :         }
      73            5208 :     }
      74                 : 
      75                 :     /** Assign a new string to the iterator.
      76                 :      *
      77                 :      *  The iterator will forget the string it was iterating through, and
      78                 :      *  return characters from the start of the new string when next called.
      79                 :      *  The string is not copied into the iterator, so it must remain valid
      80                 :      *  while the iteration is in progress.
      81                 :      *
      82                 :      *  @param s The string to read.  Must not be modified while the iteration
      83                 :      *           is in progress.
      84                 :      */
      85                 :     void assign(const std::string &s) { assign(s.data(), s.size()); }
      86                 : 
      87                 :     /** Create an iterator given a pointer to a null terminated string.
      88                 :      *
      89                 :      *  The iterator will return characters from the start of the string when
      90                 :      *  next called.  The string is not copied into the iterator, so it must
      91                 :      *  remain valid while the iteration is in progress.
      92                 :      *
      93                 :      *  @param p_ A pointer to the start of the null terminated string to read.
      94                 :      */
      95                 :     explicit Utf8Iterator(const char *p_);
      96                 : 
      97                 :     /** Create an iterator given a pointer and a length.
      98                 :      *
      99                 :      *  The iterator will return characters from the start of the string when
     100                 :      *  next called.  The string is not copied into the iterator, so it must
     101                 :      *  remain valid while the iteration is in progress.
     102                 :      *
     103                 :      *  @param p_ A pointer to the start of the string to read.
     104                 :      *
     105                 :      *  @param len The length of the string to read.
     106                 :      */
     107                 :     Utf8Iterator(const char *p_, size_t len) { assign(p_, len); }
     108                 : 
     109                 :     /** Create an iterator given a string.
     110                 :      *
     111                 :      *  The iterator will return characters from the start of the string when
     112                 :      *  next called.  The string is not copied into the iterator, so it must
     113                 :      *  remain valid while the iteration is in progress.
     114                 :      *
     115                 :      *  @param s The string to read.  Must not be modified while the iteration
     116                 :      *           is in progress.
     117                 :      */
     118            5208 :     Utf8Iterator(const std::string &s) { assign(s.data(), s.size()); }
     119                 : 
     120                 :     /** Create an iterator which is at the end of its iteration.
     121                 :      *
     122                 :      *  This can be compared to another iterator to check if the other iterator
     123                 :      *  has reached its end.
     124                 :      */
     125                 :     Utf8Iterator() : p(NULL), end(0), seqlen(0) { }
     126                 : 
     127                 :     /** Get the current unicode character value pointed to by the iterator.
     128                 :      *
     129                 :      *  Returns unsigned(-1) if the iterator has reached the end of its buffer.
     130                 :      */
     131                 :     unsigned operator*() const;
     132                 : 
     133                 :     /** Move forward to the next unicode character.
     134                 :      *
     135                 :      *  @return An iterator pointing to the position before the move.
     136                 :      */
     137                 :     Utf8Iterator operator++(int) {
     138                 :         // If we've not calculated seqlen yet, do so.
     139                 :         if (seqlen == 0) calculate_sequence_length();
     140                 :         const unsigned char *old_p = p;
     141                 :         unsigned old_seqlen = seqlen;
     142                 :         p += seqlen;  // Step over the whole byte sequence of the current character.
     143                 :         if (p == end) p = NULL;  // NULL is the end marker a default-constructed iterator also uses.
     144                 :         seqlen = 0;  // The length of the new current sequence is not known yet.
     145                 :         return Utf8Iterator(old_p, end, old_seqlen);
     146                 :     }
     147                 : 
     148                 :     /** Move forward to the next unicode character.
     149                 :      *
     150                 :      *  @return A reference to this object.
     151                 :      */
     152                 :     Utf8Iterator & operator++() {
     153                 :         if (seqlen == 0) calculate_sequence_length();  // Lazily work out how many bytes to skip.
     154                 :         p += seqlen;
     155                 :         if (p == end) p = NULL;  // NULL is the end marker a default-constructed iterator also uses.
     156                 :         seqlen = 0;
     157                 :         return *this;
     158                 :     }
     159                 : 
     160                 :     /** Test two Utf8Iterators for equality.
     161                 :      *
     162                 :      *  @return true iff the iterators point to the same position.
     163                 :      */
     164                 :     bool operator==(const Utf8Iterator &other) const { return p == other.p; }  // Only p is compared, so all iterators at their end (p == NULL) are equal.
     165                 : 
     166                 :     /** Test two Utf8Iterators for inequality.
     167                 :      *
     168                 :      *  @return true iff the iterators do not point to the same position.
     169                 :      */
     170                 :     bool operator!=(const Utf8Iterator &other) const { return p != other.p; }
     171                 : 
     172                 :     /// We implement the semantics of an STL input_iterator.
     173                 :     //@{
     174                 :     typedef std::input_iterator_tag iterator_category;  // NOTE(review): declared in <iterator>, which is not included directly here.
     175                 :     typedef unsigned value_type;
     176                 :     typedef size_t difference_type;  // NOTE(review): unsigned; standard iterators conventionally use a signed type such as std::ptrdiff_t.
     177                 :     typedef const unsigned * pointer;
     178                 :     typedef const unsigned & reference;
     179                 :     //@}
     180                 : };
     181                 : 
     182                 : namespace Unicode {
     183                 : 
     184                 : /** Each unicode character is in one of these categories.  The numeric values matter: they are decoded from the character info and used as bit positions in masks. */
     185                 : typedef enum {
     186                 :     UNASSIGNED,  // 0; also returned by get_category() for values >= 0x110000.
     187                 :     UPPERCASE_LETTER,
     188                 :     LOWERCASE_LETTER,
     189                 :     TITLECASE_LETTER,
     190                 :     MODIFIER_LETTER,
     191                 :     OTHER_LETTER,
     192                 :     NON_SPACING_MARK,
     193                 :     ENCLOSING_MARK,
     194                 :     COMBINING_SPACING_MARK,
     195                 :     DECIMAL_DIGIT_NUMBER,
     196                 :     LETTER_NUMBER,
     197                 :     OTHER_NUMBER,
     198                 :     SPACE_SEPARATOR,
     199                 :     LINE_SEPARATOR,
     200                 :     PARAGRAPH_SEPARATOR,
     201                 :     CONTROL,
     202                 :     FORMAT,
     203                 :     PRIVATE_USE,
     204                 :     SURROGATE,
     205                 :     CONNECTOR_PUNCTUATION,
     206                 :     DASH_PUNCTUATION,
     207                 :     OPEN_PUNCTUATION,
     208                 :     CLOSE_PUNCTUATION,
     209                 :     INITIAL_QUOTE_PUNCTUATION,
     210                 :     FINAL_QUOTE_PUNCTUATION,
     211                 :     OTHER_PUNCTUATION,
     212                 :     MATH_SYMBOL,
     213                 :     CURRENCY_SYMBOL,
     214                 :     MODIFIER_SYMBOL,
     215                 :     OTHER_SYMBOL  // 29; the largest value still fits the 5-bit field read by Internal::get_category().
     216                 : } category;
     217                 : 
     218                 : namespace Internal {
     219                 :     /** @internal Extract the information about a character from the Unicode
     220                 :      *  character tables.
     221                 :      *
     222                 :      *  ch must be a valid Unicode character value (i.e. < 0x110000)
     223                 :      */
     224                 :     XAPIAN_VISIBILITY_DEFAULT
     225                 :     int get_character_info(unsigned ch);
     226                 : 
     227                 :     /** @internal Extract how to convert the case of a unicode character from
     228                 :      *  its info (the 3-bit field held in bits 5-7 of @a info).
     229                 :      */
     230                 :     inline int get_case_type(int info) { return ((info & 0xe0) >> 5); }
     231                 : 
     232                 :     /// @internal Extract the category of a unicode character from its info (the low 5 bits).
     233                 :     inline category get_category(int info) { return static_cast<category>(info & 0x1f); }
     234                 : 
     235                 :     /** @internal Extract the delta to use for case conversion of a character
     236                 :      *  from its info (bits 15 and up, treated as a signed value).
     237                 :      */
     238                 :     inline int get_delta(int info) {
     239                 :         /* It's implementation defined if sign extension happens on right shift
     240                 :          * of a signed int, hence the conditional (hopefully the compiler will
     241                 :          * spot this and optimise it to a sign-extending shift on architectures
     242                 :          * with a suitable instruction).
     243                 :          */
     244                 :         return (info >= 0) ? (info >> 15) : (~(~info >> 15));  // For negative info, ~info is non-negative, so the inner shift is well defined; the outer ~ restores the sign.
     245                 :     }
     246                 : }
     247                 : 
     248                 : /** Convert a single non-ASCII unicode character to UTF-8.
     249                 :  *
     250                 :  *  This is intended mainly as a helper method for to_utf8().
     251                 :  *
     252                 :  *  The character @a ch (which must be >= 128) is written to the buffer @a buf
     253                 :  *  and the length of the resultant UTF-8 character is returned.
     254                 :  *
     255                 :  *  NB buf must have space for (at least) 4 bytes.
     256                 :  */
     257                 : XAPIAN_VISIBILITY_DEFAULT
     258                 : unsigned nonascii_to_utf8(unsigned ch, char * buf);
     259                 : 
     260                 : /** Convert a single unicode character to UTF-8.
     261                 :  *
     262                 :  *  The character @a ch is written to the buffer @a buf and the length of the
     263                 :  *  resultant UTF-8 character is returned.  No terminating NUL is written here.
     264                 :  *
     265                 :  *  NB buf must have space for (at least) 4 bytes.
     266                 :  */
     267                 : inline unsigned to_utf8(unsigned ch, char *buf) {
     268                 :     if (ch < 128) {  // ASCII fast path: the value is stored as a single byte.
     269                 :         *buf = static_cast<unsigned char>(ch);
     270                 :         return 1;
     271                 :     }
     272                 :     return Xapian::Unicode::nonascii_to_utf8(ch, buf);  // Everything from 128 upwards is handled out of line.
     273                 : }
     274                 : 
     275                 : /** Append the UTF-8 representation of a single unicode character to a
     276                 :  *  std::string (the bytes are produced by to_utf8()).
     277                 :  */
     278                 : inline void append_utf8(std::string &s, unsigned ch) {
     279                 :     char buf[4];  // to_utf8() is documented as needing space for at least 4 bytes.
     280                 :     s.append(buf, to_utf8(ch, buf));  // to_utf8() fills buf and returns how many bytes to append.
     281                 : }
     282                 : 
     283                 : /// Return the category which a given unicode character falls into (UNASSIGNED for values >= 0x110000).
     284                 : inline category get_category(unsigned ch) {
     285                 :     // Categorise non-Unicode values as UNASSIGNED.
     286                 :     if (ch >= 0x110000) return Xapian::Unicode::UNASSIGNED;  // Also keeps get_character_info() within its documented input range.
     287                 :     return Internal::get_category(Internal::get_character_info(ch));
     288                 : }
     289                 : 
     290                 : /// Test if a given unicode character is a letter, a number, or connector punctuation.
     291                 : inline bool is_wordchar(unsigned ch) {
     292                 :     const unsigned int WORDCHAR_MASK =  // One bit per category value; a set bit means "word character".
     293                 :             (1 << Xapian::Unicode::UPPERCASE_LETTER) |
     294                 :             (1 << Xapian::Unicode::LOWERCASE_LETTER) |
     295                 :             (1 << Xapian::Unicode::TITLECASE_LETTER) |
     296                 :             (1 << Xapian::Unicode::MODIFIER_LETTER) |
     297                 :             (1 << Xapian::Unicode::OTHER_LETTER) |
     298                 :             (1 << Xapian::Unicode::DECIMAL_DIGIT_NUMBER) |
     299                 :             (1 << Xapian::Unicode::LETTER_NUMBER) |
     300                 :             (1 << Xapian::Unicode::OTHER_NUMBER) |
     301                 :             (1 << Xapian::Unicode::CONNECTOR_PUNCTUATION);
     302                 :     return ((WORDCHAR_MASK >> get_category(ch)) & 1);  // Select the bit for this character's category.
     303                 : }
     304                 : 
     305                 : /// Test if a given unicode character is a whitespace character (every CONTROL category character counts as whitespace here).
     306                 : inline bool is_whitespace(unsigned ch) {
     307                 :     const unsigned int WHITESPACE_MASK =  // One bit per category value; a set bit means "whitespace".
     308                 :             (1 << Xapian::Unicode::CONTROL) | // For TAB, CR, LF, FF.
     309                 :             (1 << Xapian::Unicode::SPACE_SEPARATOR) |
     310                 :             (1 << Xapian::Unicode::LINE_SEPARATOR) |
     311                 :             (1 << Xapian::Unicode::PARAGRAPH_SEPARATOR);
     312                 :     return ((WHITESPACE_MASK >> get_category(ch)) & 1);  // Select the bit for this character's category.
     313                 : }
     314                 : 
     315                 : /// Test if a given unicode character is a currency symbol.
     316                 : inline bool is_currency(unsigned ch) {
     317                 :     return (get_category(ch) == Xapian::Unicode::CURRENCY_SYMBOL);
     318                 : }
     319                 : 
     320                 : /// Convert a unicode character to lowercase (returns @a ch unchanged if no conversion applies).
     321                 : inline unsigned tolower(unsigned ch) {
     322                 :     int info;  // Assigned inside the condition below, and only when ch is in range.
     323                 :     // Leave non-Unicode values, and characters whose case type lacks bit value 2, unchanged.
     324                 :     if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 2))
     325                 :         return ch;
     326                 :     return ch + Internal::get_delta(info);  // Lowercasing adds the signed delta from the info.
     327                 : }
     328                 : 
     329                 : /// Convert a unicode character to uppercase (returns @a ch unchanged if no conversion applies).
     330                 : inline unsigned toupper(unsigned ch) {
     331                 :     int info;  // Assigned inside the condition below, and only when ch is in range.
     332                 :     // Leave non-Unicode values, and characters whose case type lacks bit value 4, unchanged.
     333                 :     if (ch >= 0x110000 || !(Internal::get_case_type((info = Xapian::Unicode::Internal::get_character_info(ch))) & 4))
     334                 :         return ch;
     335                 :     return ch - Internal::get_delta(info);  // Uppercasing subtracts the same signed delta that tolower() adds.
     336                 : }
     337                 : 
     338                 : /// Convert a UTF-8 std::string to lowercase, one decoded character at a time.
     339                 : inline std::string
     340                 : tolower(const std::string &term)
     341                 : {
     342                 :     std::string result;
     343                 :     result.reserve(term.size());  // Starting capacity only; append_utf8() may still grow the string.
     344                 :     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {  // A default-constructed iterator is the end marker.
     345                 :         append_utf8(result, tolower(*i));  // Decode, lowercase, then re-encode as UTF-8.
     346                 :     }
     347                 :     return result;
     348                 : }
     349                 : 
     350                 : /// Convert a UTF-8 std::string to uppercase, one decoded character at a time.
     351                 : inline std::string
     352                 : toupper(const std::string &term)
     353                 : {
     354                 :     std::string result;
     355                 :     result.reserve(term.size());  // Starting capacity only; append_utf8() may still grow the string.
     356                 :     for (Utf8Iterator i(term); i != Utf8Iterator(); ++i) {  // A default-constructed iterator is the end marker.
     357                 :         append_utf8(result, toupper(*i));  // Decode, uppercase, then re-encode as UTF-8.
     358                 :     }
     359                 :     return result;
     360                 : }
     361                 : 
     362                 : }
     363                 : 
     364                 : }
     365                 : 
     366                 : #endif // XAPIAN_INCLUDED_UNICODE_H

Generated by: LTP GCOV extension version 1.6