Class util implements some utilities for NLP analyzers: "tolower" for Latin alphabets, tag manipulation, wstring2number and vice versa conversions, etc. More...

#include <util.h>

Static Public Member Functions
static void	init_locale (const std::wstring &s=L"default")
	Init the locale of the program, to properly handle unicode.
static void	open_utf8_file (std::wifstream &, const std::wstring &)
	open an UTF8 file for reading
static void	open_utf8_file (std::wofstream &, const std::wstring &)
	open a UTF8 file for writing
static std::wstring	lowercase (const std::wstring &)
	Lowercase a wstring, even with latin characters.
static std::wstring	uppercase (const std::wstring &)
	uppercase a wstring, even with latin characters
static bool	has_lowercase (const std::wstring &)
	find out whether a wstring contains a lowercase char
static bool	has_alphanum (const std::wstring &)
	find out whether a wstring contains an alphanumeric char, including Latin characters
static bool	is_capitalized (const std::wstring &)
	find out whether a word is capitalized
static bool	all_digits (const std::wstring &)
	find out whether a wstring is an integer number
static bool	all_caps (const std::wstring &)
	find out whether a wstring contains only uppercase letters
static std::wstring	absolute (const std::wstring &, const std::wstring &)
	filename management
static std::wstring	eliminateChars (const std::wstring &, const std::wstring &)
	eliminate from a wstring a list of chars
static std::vector< std::wstring >	split (const std::wstring &, const std::wstring &)
	the split from perl in c++ version
static void	find_and_replace (std::wstring &, const std::wstring &, const std::wstring &)
	wstring handling
static int	wstring2int (const std::wstring &)
	conversion utilities
static std::wstring	int2wstring (const int)
	Type conversion.
static double	wstring2double (const std::wstring &)
	Type conversion.
static std::wstring	double2wstring (const double)
	Type conversion.
static long double	wstring2longdouble (const std::wstring &)
	Type conversion.
static std::wstring	longdouble2wstring (const long double)
	Type conversion.
static std::wstring	vector2wstring (const std::vector< std::wstring > &, const std::wstring &)
	Create a single wstring concatenating all wstrings in given vector with given separator.
static std::wstring	list2wstring (const std::list< std::wstring > &, const std::wstring &)
	Create a single wstring concatenating all wstrings in given list with given separator.
static std::wstring	pairlist2wstring (const std::list< std::pair< std::wstring, double > > &, const std::wstring &, const std::wstring &)
	Create a single wstring concatenating all elements in given list with given separators (one for list elements, one for pair elements).
static std::list< std::wstring >	wstring2list (const std::wstring &, const std::wstring &)
	Split a wstring into a list of wstrings given a separator subwstring.
static std::vector< std::wstring >	wstring2vector (const std::wstring &, const std::wstring &)
	Split a wstring into a vector of wstrings given a separator subwstring.
static std::set< std::wstring >	wstring2set (const std::wstring &, const std::wstring &)
	Split a wstring into a set of wstrings given a separator subwstring.
static std::wstring	set2wstring (const std::set< std::wstring > &, const std::wstring &)
	Create a single wstring concatenating all wstrings in given set with given separator.
static std::string	wstring2string (const std::wstring &s)
	Convert a wstring to a string (possibly with utf8 chars)
static std::wstring	string2wstring (const std::string &s)
	Convert a string (possibly with utf8 chars) to a wstring.
static int	capitalization (const std::wstring &)
	Find out the capitalization pattern of a wstring.
static std::wstring	capitalize (const std::wstring &, int, bool)
	Format a string to the specified capitalization pattern.
Static Public Attributes
static boost::u32regex	RE_has_lowercase = boost::make_u32regex(L"[[:lower:]]")
	useful regexps
static boost::u32regex	RE_has_alphanum = boost::make_u32regex(L"[[:alnum:]]")
static boost::u32regex	RE_is_capitalized = boost::make_u32regex(L"^[[:upper:]]")
static boost::u32regex	RE_all_digits = boost::make_u32regex(L"^[[:digit:]]+$")
static boost::u32regex	RE_all_caps = boost::make_u32regex(L"^[[:upper:]]+$")
static boost::u32regex	RE_initial_dot = boost::make_u32regex(L"^[[:upper:]]\\.?$")
static boost::u32regex	RE_all_caps_dot = boost::make_u32regex(L"^[[:upper:]]+\\.?$")
static boost::u32regex	RE_capitalized_dot = boost::make_u32regex(L"^([[:upper:]][[:lower:]]+\\.?)+$")
static boost::u32regex	RE_has_digits = boost::make_u32regex(L"[[:digit:]]+")
static boost::u32regex	RE_lowercase_dot = boost::make_u32regex(L"^[[:lower:]]+\\.?$")

Detailed Description

Class util implements some utilities for NLP analyzers: "tolower" for Latin alphabets, tag manipulation, wstring2number and vice versa conversions, etc.

Member Function Documentation

wstring util::absolute	(	const std::wstring &	,
		const std::wstring &
	)		`[static]`

filename management

convert a relative path to absolute.

Referenced by bioner::bioner(), completer::completer(), depLabeler::depLabeler(), nec::nec(), semanticDB::semanticDB(), and ukb_wrap::ukb_wrap().

bool util::all_caps ( const std::wstring & ) [static]

find out whether a wstring contains only uppercase letters

Find out whether a string is made only of uppercase letters.

Referenced by capitalization().

bool util::all_digits ( const std::wstring & ) [static]

find out whether a wstring is an integer number

Find out whether a string is made only of numeric chars.

int util::capitalization ( const std::wstring & ) [static]

Find out the capitalization pattern of a wstring (presumably reported as one of UPPER_NONE, UPPER_1ST or UPPER_ALL).

References all_caps(), is_capitalized(), UPPER_1ST, UPPER_ALL, and UPPER_NONE.

Referenced by affixes::ApplyRule(), and dictionary::check_contracted().

wstring util::capitalize	(	const std::wstring &	,
		int	,
		bool
	)		`[static]`

Format a string to the specified capitalization pattern.

References UPPER_1ST, UPPER_ALL, and uppercase().

Referenced by dictionary::check_contracted(), and affixes::CheckRetokenizable().

wstring util::double2wstring ( const double x ) [static]

Type conversion.

Referenced by nec::analyze(), hmm_tagger::annotate(), probabilities::compute_probability(), probabilities::guesser(), pairlist2wstring(), hmm_tagger::ProbA_log(), hmm_tagger::ProbB_log(), and traces::trace_word().

wstring util::eliminateChars	(	const std::wstring &	,
		const std::wstring &
	)		`[static]`

eliminate from a wstring a list of chars

Auxiliary function: delete from text any char present in clist.

void util::find_and_replace	(	std::wstring &	,
		const std::wstring &	,
		const std::wstring &
	)		`[static]`

wstring handling

Replace all occurrences of s in t by r.

Referenced by phonetics::check_cond().

bool util::has_alphanum ( const std::wstring & ) [static]

find out whether a wstring contains an alphanumeric char, including Latin characters

Find out whether a char is alphanum, considering latin characters.

Referenced by punts::analyze().

bool util::has_lowercase ( const std::wstring & ) [static]

find out whether a wstring contains a lowercase char

Find out whether a wstring contains lowercase chars.

Referenced by ner_module::ValidMultiWord().

void util::init_locale ( const std::wstring & s = L"default" ) [static]

Init the locale of the program, to properly handle unicode.

Init the locale of the program.

If no parameter is given, the default locale en_US.utf8 is used. If "system" is specified, the system locale is used. Otherwise, the given locale is used. In any case, the selected locale is used only for alphanumerical functions (utf8 encoding, tolower, isalpha, etc.). Note that for FreeLing to work with UTF8 texts, the locale must be set to some UTF-8 locale (e.g. "en_US.utf8") installed in the system.

References wstring2string().

wstring util::int2wstring ( const int x ) [static]

bool util::is_capitalized ( const std::wstring & ) [static]

find out whether a word is capitalized

Find out whether a char is uppercase, considering latin characters.

Referenced by ner_module::BuildMultiword(), capitalization(), splitter::end_of_sentence(), and np::np().

wstring util::list2wstring	(	const std::list< std::wstring > &	,
		const std::wstring &
	)		`[static]`

Create a single wstring concatenating all wstrings in given list with given separator.

Referenced by dictionary::check_contracted(), completer::complete(), completer::completer(), completer::find_grammar_rule(), sense_info::get_parents_string(), semanticDB::get_word_senses(), and dictionary::search_form().

wstring util::longdouble2wstring ( const long double x ) [static]

Type conversion.

wstring util::lowercase ( const std::wstring & ) [static]

Lowercase a wstring, even with latin characters.

Lowercase a string, possibly with accents.

Referenced by ner::ner(), dictionary::search_form(), and word::set_form().

static void util::open_utf8_file	(	std::wifstream &	,
		const std::wstring &
	)		`[static]`

static void util::open_utf8_file	(	std::wofstream &	,
		const std::wstring &
	)		`[static]`

open a UTF8 file for writing

wstring util::pairlist2wstring	(	const std::list< std::pair< std::wstring, double > > &	,
		const std::wstring &	,
		const std::wstring &
	)		`[static]`

Create a single wstring concatenating all elements in given list with given separators (one for list elements, one for pair elements).

References double2wstring().

Referenced by analysis::get_senses_string(), and word::get_senses_string().

wstring util::set2wstring	(	const std::set< std::wstring > &	,
		const std::wstring &
	)		`[static]`

Create a single wstring concatenating all wstrings in given set with given separator.

Referenced by completer::complete(), completer::completer(), check_wordclass::eval(), completer::find_grammar_rule(), and rule_expression::match().

vector< wstring > util::split	(	const std::wstring &	,
		const std::wstring &
	)		`[static]`

the split from perl in c++ version

Split a wstring into a vector of wstrings given a list of separators.

wstring util::string2wstring ( const std::string & s ) [static]

Convert a string (possibly with utf8 chars) to a wstring.

wstring util::uppercase ( const std::wstring & ) [static]

uppercase a wstring, even with latin characters

Uppercase a string, possibly with accents.

Referenced by capitalize(), and dictionary::check_contracted().

wstring util::vector2wstring	(	const std::vector< std::wstring > &	,
		const std::wstring &
	)		`[static]`

Create a single wstring concatenating all wstrings in given vector with given separator.

Referenced by hmm_tagger::hmm_tagger(), and hmm_tagger::is_forbidden().

double util::wstring2double ( const std::wstring & ) [static]

Type conversion.

Referenced by probabilities::probabilities().

int util::wstring2int ( const std::wstring & ) [static]

conversion utilities

Type conversion.

Referenced by ner_module::ner_module(), and locutions::ValidMultiWord().

list< wstring > util::wstring2list	(	const std::wstring &	,
		const std::wstring &
	)		`[static]`

Split a wstring into a list of wstrings given a separator subwstring.

Referenced by affixes::ApplyRule(), dictionary::check_contracted(), affixes::CheckRetokenizable(), dictionary::dictionary(), dictionary::get_forms(), semanticDB::get_sense_words(), semanticDB::get_WN_keys(), semanticDB::get_word_senses(), dictionary::search_form(), sense_info::sense_info(), and locutions::ValidMultiWord().

long double util::wstring2longdouble ( const std::wstring & ) [static]

Type conversion.

set< wstring > util::wstring2set	(	const std::wstring &	,
		const std::wstring &
	)		`[static]`

Split a wstring into a set of wstrings given a separator subwstring.

Referenced by completer::completer().

string util::wstring2string ( const std::wstring & s ) [static]

Convert a wstring to a string (possibly with utf8 chars)

Referenced by init_locale(), and ukb_wrap::ukb_wrap().

vector< wstring > util::wstring2vector	(	const std::wstring &	,
		const std::wstring &
	)		`[static]`

Split a wstring into a vector of wstrings given a separator subwstring.

Referenced by hmm_tagger::hmm_tagger(), hmm_tagger::is_forbidden(), completer::match_pattern(), and completer::matching_context().

Member Data Documentation

boost::u32regex util::RE_all_caps = boost::make_u32regex(L"^[[:upper:]]+$") [static]

boost::u32regex util::RE_all_caps_dot = boost::make_u32regex(L"^[[:upper:]]+\\.?$") [static]

boost::u32regex util::RE_all_digits = boost::make_u32regex(L"^[[:digit:]]+$") [static]

boost::u32regex util::RE_capitalized_dot = boost::make_u32regex(L"^([[:upper:]][[:lower:]]+\\.?)+$") [static]

boost::u32regex util::RE_has_alphanum = boost::make_u32regex(L"[[:alnum:]]") [static]

boost::u32regex util::RE_has_digits = boost::make_u32regex(L"[[:digit:]]+") [static]

boost::u32regex util::RE_has_lowercase = boost::make_u32regex(L"[[:lower:]]") [static]

useful regexps

Create useful regexps.

boost::u32regex util::RE_initial_dot = boost::make_u32regex(L"^[[:upper:]]\\.?$") [static]

boost::u32regex util::RE_is_capitalized = boost::make_u32regex(L"^[[:upper:]]") [static]

boost::u32regex util::RE_lowercase_dot = boost::make_u32regex(L"^[[:lower:]]+\\.?$") [static]

The documentation for this class was generated from the following files:

Static Public Member Functions

Static Public Attributes

Detailed Description

Member Function Documentation

Member Data Documentation