SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
csa_wt.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CSA_WT
9#define INCLUDED_SDSL_CSA_WT
10
11#include <cassert>
12#include <iostream>
13#include <stddef.h>
14#include <stdint.h>
15#include <string>
16#include <type_traits>
17
18#include <sdsl/cereal.hpp>
19#include <sdsl/config.hpp>
22#include <sdsl/int_vector.hpp>
24#include <sdsl/io.hpp>
25#include <sdsl/iterators.hpp>
30#include <sdsl/util.hpp>
31#include <sdsl/wt_huff.hpp>
32
33namespace sdsl
34{
35
38
49template <class t_wt = wt_huff<>, // Wavelet tree type
50 uint32_t t_dens = 32, // Sample density for suffix array (SA) values
51 uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values
52 class t_sa_sample_strat = sa_order_sa_sampling<>, // Policy class for the SA sampling.
53 class t_isa_sample_strat = isa_sampling<>, // Policy class for ISA sampling.
54 class t_alphabet_strat = // Policy class for the representation of the alphabet.
55 typename wt_alphabet_trait<t_wt>::type>
56class csa_wt
57{
58 static_assert(std::is_same<typename index_tag<t_wt>::type, wt_tag>::value,
59 "First template argument has to be a wavelet tree type.");
60 static_assert(t_dens > 0, "Second template argument has to be greater then 0.");
61 static_assert(t_inv_dens > 0, "Third template argument has to be greater then 0.");
62 static_assert(std::is_same<typename sampling_tag<t_sa_sample_strat>::type, sa_sampling_tag>::value,
63 "Forth template argument has to be a suffix array sampling strategy.");
64 static_assert(std::is_same<typename sampling_tag<t_isa_sample_strat>::type, isa_sampling_tag>::value,
65 "Fifth template argument has to be a inverse suffix array sampling strategy.");
66 static_assert(is_alphabet<t_alphabet_strat>::value, "Sixth template argument has to be a alphabet strategy.");
67
68 friend class bwt_of_csa_wt<csa_wt>;
69
70public:
71 enum
72 {
74 isa_sample_dens = t_inv_dens
75 };
76
77 typedef uint64_t value_type;
83 typedef const pointer const_pointer;
86 typedef ptrdiff_t difference_type;
93 typedef t_wt wavelet_tree_type;
94 typedef typename t_sa_sample_strat::template type<csa_wt> sa_sample_type;
95 typedef typename t_isa_sample_strat::template type<csa_wt> isa_sample_type;
96 typedef t_alphabet_strat alphabet_type;
97 typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
98 typedef typename alphabet_type::comp_char_type comp_char_type;
99 typedef typename alphabet_type::string_type string_type;
101
104 typedef typename alphabet_type::alphabet_category alphabet_category;
105
106private:
107 t_wt m_wavelet_tree; // the wavelet tree
108 sa_sample_type m_sa_sample; // suffix array samples
109 isa_sample_type m_isa_sample; // inverse suffix array samples
110 alphabet_type m_alphabet;
111//#define USE_CSA_CACHE
112#ifdef USE_CSA_CACHE
113 mutable fast_cache csa_cache;
114#endif
115
116public:
117 const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
118 const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
119 const typename alphabet_type::C_type & C = m_alphabet.C;
120 const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
121 const psi_type psi = psi_type(*this);
122 const lf_type lf = lf_type(*this);
123 const bwt_type bwt = bwt_type(*this);
124 const text_type text = text_type(*this);
126 const bwt_type L = bwt_type(*this);
127 const isa_type isa = isa_type(*this);
128 sa_sample_type const & sa_sample = m_sa_sample;
129 isa_sample_type const & isa_sample = m_isa_sample;
130 wavelet_tree_type const & wavelet_tree = m_wavelet_tree;
131
133 csa_wt() = default;
134
136 csa_wt(csa_wt const & csa) :
137 m_wavelet_tree(csa.m_wavelet_tree),
138 m_sa_sample(csa.m_sa_sample),
139 m_isa_sample(csa.m_isa_sample),
140 m_alphabet(csa.m_alphabet)
141 {
142 m_isa_sample.set_vector(&m_sa_sample);
143 }
144
146 csa_wt(csa_wt && csa) :
147 m_wavelet_tree(std::move(csa.m_wavelet_tree)),
148 m_sa_sample(std::move(csa.m_sa_sample)),
149 m_isa_sample(std::move(csa.m_isa_sample)),
150 m_alphabet(std::move(csa.m_alphabet))
151 {
152 m_isa_sample.set_vector(&m_sa_sample);
153 }
154
156 csa_wt(cache_config & config);
157
159
165 {
166 return m_wavelet_tree.size();
167 }
168
170
174 {
175 return bit_vector::max_size();
176 }
177
179
182 bool empty() const
183 {
184 return m_wavelet_tree.empty();
185 }
186
188
192 {
193 return const_iterator(this, 0);
194 }
195
197
201 {
202 return const_iterator(this, size());
203 }
204
206
212 inline value_type operator[](size_type i) const;
213
215
218 csa_wt & operator=(csa_wt const & csa)
219 {
220 if (this != &csa)
221 {
222 csa_wt tmp(csa);
223 *this = std::move(tmp);
224 }
225 return *this;
226 }
227
229
233 {
234 if (this != &csa)
235 {
236 m_wavelet_tree = std::move(csa.m_wavelet_tree);
237 m_sa_sample = std::move(csa.m_sa_sample);
238 m_isa_sample = std::move(csa.m_isa_sample);
239 m_isa_sample.set_vector(&m_sa_sample);
240 m_alphabet = std::move(csa.m_alphabet);
241 }
242 return *this;
243 }
244
246 bool operator==(csa_wt const & other) const noexcept
247 {
248 return (m_wavelet_tree == other.m_wavelet_tree) && (m_sa_sample == other.m_sa_sample)
249 && (m_isa_sample == other.m_isa_sample) && (m_alphabet == other.m_alphabet);
250 }
251
253 bool operator!=(csa_wt const & other) const noexcept
254 {
255 return !(*this == other);
256 }
257
259
262 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const;
263
265
267 void load(std::istream & in);
268
270 template <typename archive_t>
271 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const;
272
274 template <typename archive_t>
275 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar);
276
277private:
278 // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
279 /*
280 * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
281 * \param c The symbol to count the occurrences in the prefix.
282 * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
283 * \par Time complexity
284 * \f$ \Order{\log |\Sigma|} \f$
285 */
286 size_type rank_bwt(size_type i, const char_type c) const
287 {
288 return m_wavelet_tree.rank(i, c);
289 }
290
291 // Calculates the position of the i-th c in the BWT of the original text.
292 /*
293 * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$.
294 * \param c Symbol c.
295 * \returns The position of the i-th c in the BWT or size() if c does occur less then i times.
296 * \par Time complexity
297 * \f$ \Order{t_{\Psi}} \f$
298 */
299 size_type select_bwt(size_type i, const char_type c) const
300 {
301 assert(i > 0);
302 char_type cc = char2comp[c];
303 if (cc == 0 and c != 0) // character is not in the text => return size()
304 return size();
305 assert(cc != 255);
306 if (C[cc] + i - 1 < C[cc + 1])
307 {
308 return m_wavelet_tree.select(i, c);
309 }
310 else
311 return size();
312 }
313};
314
315// == template functions ==
316
317template <class t_wt,
318 uint32_t t_dens,
319 uint32_t t_inv_dens,
320 class t_sa_sample_strat,
321 class t_isa,
322 class t_alphabet_strat>
324{
326 {
327 return;
328 }
329 {
330 auto event = memory_monitor::event("construct csa-alpbabet");
333 size_type n = bwt_buf.size();
334 m_alphabet = alphabet_type(bwt_buf, n);
335 }
336 {
337 auto event = memory_monitor::event("sample SA");
338 m_sa_sample = sa_sample_type(config);
339 }
340 {
341 auto event = memory_monitor::event("sample ISA");
342 isa_sample_type isa_s(config, &m_sa_sample);
343 util::swap_support(m_isa_sample, isa_s, &m_sa_sample, &m_sa_sample);
344 }
345
346 // if ( config.delete_files ) {
347 // remove_from_cache<int_vector<>>(conf::KEY_SA, config);
348 // }
349 {
350 auto event = memory_monitor::event("construct wavelet tree");
353 m_wavelet_tree = wavelet_tree_type(bwt_buf.begin(), bwt_buf.end(), config.dir);
354 }
355}
356
357template <class t_wt,
358 uint32_t t_dens,
359 uint32_t t_inv_dens,
360 class t_sa_sample_strat,
361 class t_isa,
362 class t_alphabet_strat>
364 -> value_type
365{
366 size_type off = 0;
367 while (!m_sa_sample.is_sampled(i))
368 {
369 i = lf[i];
370 ++off;
371 }
372 value_type result = m_sa_sample[i];
373 if (result + off < size())
374 {
375 return result + off;
376 }
377 else
378 {
379 return result + off - size();
380 }
381}
382
383template <class t_wt,
384 uint32_t t_dens,
385 uint32_t t_inv_dens,
386 class t_sa_sample_strat,
387 class t_isa,
388 class t_alphabet_strat>
391 std::string name) const
392 -> size_type
393{
394 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
395 size_type written_bytes = 0;
396 written_bytes += m_wavelet_tree.serialize(out, child, "wavelet_tree");
397 written_bytes += m_sa_sample.serialize(out, child, "sa_samples");
398 written_bytes += m_isa_sample.serialize(out, child, "isa_samples");
399 written_bytes += m_alphabet.serialize(out, child, "alphabet");
400 structure_tree::add_size(child, written_bytes);
401 return written_bytes;
402}
403
404template <class t_wt,
405 uint32_t t_dens,
406 uint32_t t_inv_dens,
407 class t_sa_sample_strat,
408 class t_isa,
409 class t_alphabet_strat>
411{
412 m_wavelet_tree.load(in);
413 m_sa_sample.load(in);
414 m_isa_sample.load(in, &m_sa_sample);
415 m_alphabet.load(in);
416}
417
418template <class t_wt,
419 uint32_t t_dens,
420 uint32_t t_inv_dens,
421 class t_sa_sample_strat,
422 class t_isa,
423 class t_alphabet_strat>
424template <typename archive_t>
426 archive_t & ar) const
427{
428 ar(CEREAL_NVP(m_wavelet_tree));
429 ar(CEREAL_NVP(m_sa_sample));
430 ar(CEREAL_NVP(m_isa_sample));
431 ar(CEREAL_NVP(m_alphabet));
432}
433
434template <class t_wt,
435 uint32_t t_dens,
436 uint32_t t_inv_dens,
437 class t_sa_sample_strat,
438 class t_isa,
439 class t_alphabet_strat>
440template <typename archive_t>
442 archive_t & ar)
443{
444 ar(CEREAL_NVP(m_wavelet_tree));
445 ar(CEREAL_NVP(m_sa_sample));
446 ar(CEREAL_NVP(m_isa_sample));
447 m_isa_sample.set_vector(&m_sa_sample);
448 ar(CEREAL_NVP(m_alphabet));
449}
450
451} // end namespace sdsl
452#endif
cereal.hpp offers cereal support
#define CEREAL_NVP(X)
Definition cereal.hpp:31
#define CEREAL_LOAD_FUNCTION_NAME
Definition cereal.hpp:34
#define CEREAL_SAVE_FUNCTION_NAME
Definition cereal.hpp:35
A class for the Compressed Suffix Array (CSA) based on a Wavelet Tree (WT) of the Burrow Wheeler Tran...
Definition csa_wt.hpp:57
bool empty() const
Returns if the data strucutre is empty.
Definition csa_wt.hpp:182
random_access_const_iterator< csa_wt > const_iterator
Definition csa_wt.hpp:78
t_isa_sample_strat::template type< csa_wt > isa_sample_type
Definition csa_wt.hpp:95
csa_wt(csa_wt &&csa)
Move constructor.
Definition csa_wt.hpp:146
bool operator!=(csa_wt const &other) const noexcept
Inequality operator.
Definition csa_wt.hpp:253
csa_wt(csa_wt const &csa)
Copy constructor.
Definition csa_wt.hpp:136
ptrdiff_t difference_type
Definition csa_wt.hpp:86
const_iterator begin() const
Returns a const_iterator to the first element.
Definition csa_wt.hpp:191
void load(std::istream &in)
Load from a stream.
Definition csa_wt.hpp:410
csa_wt & operator=(csa_wt &&csa)
Assignment Move Operator.
Definition csa_wt.hpp:232
const psi_type psi
Definition csa_wt.hpp:121
wavelet_tree_type const & wavelet_tree
Definition csa_wt.hpp:130
const first_row_type F
Definition csa_wt.hpp:125
csa_tag index_category
Definition csa_wt.hpp:102
const bwt_type bwt
Definition csa_wt.hpp:123
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition csa_wt.hpp:425
const alphabet_type::char2comp_type & char2comp
Definition csa_wt.hpp:117
const value_type const_reference
Definition csa_wt.hpp:80
traverse_csa_wt< csa_wt, true > psi_type
Definition csa_wt.hpp:87
static size_type max_size()
Returns the largest size that csa_wt can ever have.
Definition csa_wt.hpp:173
const text_type text
Definition csa_wt.hpp:124
const alphabet_type::sigma_type & sigma
Definition csa_wt.hpp:120
alphabet_type::string_type string_type
Definition csa_wt.hpp:99
const_iterator iterator
Definition csa_wt.hpp:79
bool operator==(csa_wt const &other) const noexcept
Equality operator.
Definition csa_wt.hpp:246
@ isa_sample_dens
Definition csa_wt.hpp:74
text_of_csa< csa_wt > text_type
Definition csa_wt.hpp:92
t_sa_sample_strat::template type< csa_wt > sa_sample_type
Definition csa_wt.hpp:94
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
Definition csa_wt.hpp:389
const_reference * pointer
Definition csa_wt.hpp:82
size_type csa_size_type
Definition csa_wt.hpp:85
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Serialise (load) via cereal.
Definition csa_wt.hpp:441
value_type operator[](size_type i) const
[]-operator
Definition csa_wt.hpp:363
const lf_type lf
Definition csa_wt.hpp:122
bwt_of_csa_wt< csa_wt > bwt_type
Definition csa_wt.hpp:89
const_reference reference
Definition csa_wt.hpp:81
const alphabet_type::C_type & C
Definition csa_wt.hpp:119
alphabet_type::char_type char_type
Definition csa_wt.hpp:97
size_type size() const
Number of elements in the .
Definition csa_wt.hpp:164
isa_sample_type const & isa_sample
Definition csa_wt.hpp:129
isa_of_csa_wt< csa_wt > isa_type
Definition csa_wt.hpp:90
csa_wt()=default
Default constructor.
const isa_type isa
Definition csa_wt.hpp:127
alphabet_type::comp_char_type comp_char_type
Definition csa_wt.hpp:98
alphabet_type::alphabet_category alphabet_category
Definition csa_wt.hpp:104
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition csa_wt.hpp:200
sa_sample_type const & sa_sample
Definition csa_wt.hpp:128
lf_tag extract_category
Definition csa_wt.hpp:103
int_vector ::size_type size_type
Definition csa_wt.hpp:84
first_row_of_csa< csa_wt > first_row_type
Definition csa_wt.hpp:91
t_alphabet_strat alphabet_type
Definition csa_wt.hpp:96
traverse_csa_wt< csa_wt, false > lf_type
Definition csa_wt.hpp:88
csa_wt csa_type
Definition csa_wt.hpp:100
t_wt wavelet_tree_type
Definition csa_wt.hpp:93
const bwt_type L
Definition csa_wt.hpp:126
const pointer const_pointer
Definition csa_wt.hpp:83
uint64_t value_type
Definition csa_wt.hpp:77
const alphabet_type::comp2char_type & comp2char
Definition csa_wt.hpp:118
csa_wt & operator=(csa_wt const &csa)
Assignment Operator.
Definition csa_wt.hpp:218
uint64_t size() const
Returns the number of elements currently stored.
int_vector_size_type size_type
static size_type max_size() noexcept
static mm_event_proxy event(std::string const &name)
Generic iterator for a random access container.
Definition iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, std::string const &name, std::string const &type)
static void add_size(structure_tree_node *v, uint64_t value)
A wrapper class for the and LF function for (compressed) suffix arrays that are based on a wavelet t...
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
iterators.hpp contains an generic iterator for random access containers.
memory_tracking.hpp contains two function for allocating and deallocating memory
Namespace for the succinct data structure library.
bool cache_file_exists(std::string const &key, cache_config const &config)
Checks if the resource specified by the key exists in the cache.
Definition io.hpp:733
std::string cache_file_name(std::string const &key, cache_config const &config)
Returns the file name of the resource.
Definition io.hpp:688
constexpr char const * key_bwt()
int_vector ::size_type size(range_type const &r)
Size of a range.
Contains declarations and definitions of data structure concepts.
Helper class for construction process.
Definition config.hpp:66
std::string dir
Definition config.hpp:70
structure_tree.hpp contains a helper class which can represent the memory structure of a class.
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.