SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
csa_sada.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CSA_SADA
9#define INCLUDED_SDSL_CSA_SADA
10
11#include <cassert>
12#include <iostream>
13#include <stddef.h>
14#include <stdint.h>
15#include <string>
16#include <type_traits>
17#include <vector>
18
19#include <sdsl/bits.hpp>
20#include <sdsl/cereal.hpp>
21#include <sdsl/config.hpp>
24#include <sdsl/enc_vector.hpp>
25#include <sdsl/int_vector.hpp>
28#include <sdsl/io.hpp>
29#include <sdsl/iterators.hpp>
34#include <sdsl/util.hpp>
35
36namespace sdsl
37{
38
40
51template <class t_enc_vec = enc_vector<>, // Vector type used to store the Psi-function
52 uint32_t t_dens = 32, // Sample density for suffix array (SA) values
53 uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values
54 class t_sa_sample_strat = sa_order_sa_sampling<>, // Policy class for the SA sampling.
55 class t_isa_sample_strat = isa_sampling<>, // Policy class for ISA sampling.
56 class t_alphabet_strat = byte_alphabet // Policy class for the representation of the alphabet.
57 >
59{
60 static_assert(is_enc_vec<t_enc_vec>::value, "First template argument has to be of type env_vector.");
61 static_assert(t_dens > 0, "Second template argument has to be greater then 0.");
62 static_assert(t_inv_dens > 0, "Third template argument has to be greater then 0.");
63 static_assert(std::is_same<typename sampling_tag<t_sa_sample_strat>::type, sa_sampling_tag>::value,
64 "Forth template argument has to be a suffix array sampling strategy.");
65 static_assert(std::is_same<typename sampling_tag<t_isa_sample_strat>::type, isa_sampling_tag>::value,
66 "Fifth template argument has to be a inverse suffix array sampling strategy.");
67 static_assert(is_alphabet<t_alphabet_strat>::value, "Sixth template argument has to be a alphabet strategy.");
68
69 friend class bwt_of_csa_psi<csa_sada>;
70
71public:
72 enum
73 {
75 isa_sample_dens = t_inv_dens
76 };
77
78 typedef uint64_t value_type;
84 typedef const pointer const_pointer;
87 typedef ptrdiff_t difference_type;
88 typedef t_enc_vec enc_vector_type;
95 typedef typename t_sa_sample_strat::template type<csa_sada> sa_sample_type;
96 typedef typename t_isa_sample_strat::template type<csa_sada> isa_sample_type;
97 typedef t_alphabet_strat alphabet_type;
98 typedef typename alphabet_type::alphabet_category alphabet_category;
99 typedef typename alphabet_type::comp_char_type comp_char_type;
100 typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
101 typedef typename alphabet_type::string_type string_type;
103
106
107 friend class traverse_csa_psi<csa_sada, true>;
108 friend class traverse_csa_psi<csa_sada, false>;
109
110 static const uint32_t linear_decode_limit = 100000;
111
112private:
113 enc_vector_type m_psi; // psi function
114 sa_sample_type m_sa_sample; // suffix array samples
115 isa_sample_type m_isa_sample; // inverse suffix array samples
116 alphabet_type m_alphabet; // alphabet component
117
118 mutable std::vector<uint64_t> m_psi_buf; // buffer for decoded psi values
119
120 void create_buffer()
121 {
122 if (enc_vector_type::sample_dens < linear_decode_limit)
123 {
124 m_psi_buf = std::vector<uint64_t>(enc_vector_type::sample_dens + 1);
125 }
126 }
127
128public:
129 const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
130 const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
131 const typename alphabet_type::C_type & C = m_alphabet.C;
132 const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
133 psi_type const & psi = m_psi;
134 const lf_type lf = lf_type(*this);
135 const bwt_type bwt = bwt_type(*this);
136 const isa_type isa = isa_type(*this);
137 const bwt_type L = bwt_type(*this);
139 const text_type text = text_type(*this);
140 sa_sample_type const & sa_sample = m_sa_sample;
141 isa_sample_type const & isa_sample = m_isa_sample;
142
145 {
146 create_buffer();
147 }
150 {}
151
153 csa_sada(csa_sada const & csa) :
154 m_psi(csa.m_psi),
155 m_sa_sample(csa.m_sa_sample),
156 m_isa_sample(csa.m_isa_sample),
157 m_alphabet(csa.m_alphabet)
158 {
159 create_buffer();
160 m_isa_sample.set_vector(&m_sa_sample);
161 }
162
165 m_psi(std::move(csa.m_psi)),
166 m_sa_sample(std::move(csa.m_sa_sample)),
167 m_isa_sample(std::move(csa.m_isa_sample)),
168 m_alphabet(std::move(csa.m_alphabet))
169 {
170 create_buffer();
171 m_isa_sample.set_vector(&m_sa_sample);
172 }
173
174 csa_sada(cache_config & config);
175
177
183 {
184 return m_psi.size();
185 }
186
188
192 {
193 return t_enc_vec::max_size();
194 }
195
197
200 bool empty() const
201 {
202 return m_psi.empty();
203 }
204
206
210 {
211 return const_iterator(this, 0);
212 }
213
215
219 {
220 return const_iterator(this, size());
221 }
222
224
230 inline value_type operator[](size_type i) const;
231
233
237 {
238 if (this != &csa)
239 {
240 csa_sada tmp(csa);
241 *this = std::move(tmp);
242 }
243 return *this;
244 }
245
247
251 {
252 if (this != &csa)
253 {
254 m_psi = std::move(csa.m_psi);
255 m_sa_sample = std::move(csa.m_sa_sample);
256 m_isa_sample = std::move(csa.m_isa_sample);
257 m_isa_sample.set_vector(&m_sa_sample);
258 m_alphabet = std::move(csa.m_alphabet);
259 m_psi_buf = std::move(csa.m_psi_buf);
260 }
261 return *this;
262 }
263
265 bool operator==(csa_sada const & other) const noexcept
266 {
267 return (m_psi == other.m_psi) && (m_sa_sample == other.m_sa_sample) && (m_isa_sample == other.m_isa_sample)
268 && (m_alphabet == other.m_alphabet);
269 }
270
272 bool operator!=(csa_sada const & other) const noexcept
273 {
274 return !(*this == other);
275 }
276
278
281 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const;
282
284
286 void load(std::istream & in);
287
288 template <typename archive_t>
289 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const;
290
291 template <typename archive_t>
292 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar);
293
294 uint32_t get_sample_dens() const
295 {
296 return t_dens;
297 }
298
299private:
300 // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
301 /*
302 * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
303 * \param c The symbol to count the occurrences in the prefix.
304 * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
305 * \par Time complexity
306 * \f$ \Order{\log n t_{\Psi}} \f$
307 */
308 size_type rank_bwt(size_type i, const char_type c) const
309 {
311 if (cc == 0 and c != 0) // character is not in the text => return 0
312 return 0;
313 if (i == 0)
314 return 0;
315 assert(i <= size());
316
317 size_type lower_b, upper_b; // lower_b inclusive, upper_b exclusive
318
319 const size_type sd = m_psi.get_sample_dens();
320 size_type lower_sb = (C[cc] + sd - 1) / sd; // lower_sb inclusive
321 size_type upper_sb = (C[cc + 1] + sd - 1) / sd; // upper_sb exclusive
322 while (lower_sb + 1 < upper_sb)
323 {
324 size_type mid = (lower_sb + upper_sb) / 2;
325 if (m_psi.sample(mid) >= i)
326 upper_sb = mid;
327 else
328 lower_sb = mid;
329 }
330
331 if (lower_sb == upper_sb)
332 { // the interval was smaller than sd
333 lower_b = C[cc];
334 upper_b = C[cc + 1];
335 }
336 else if (lower_sb > (C[cc] + sd - 1) / sd)
337 { // main case
338 // TODO: don't use get_inter_sampled_values if t_dens is really
339 // large
340 lower_b = lower_sb * sd;
341 if (0 == m_psi_buf.size())
342 {
343 upper_b = std::min(upper_sb * sd, C[cc + 1]);
344 goto finish;
345 }
346 uint64_t * p = m_psi_buf.data();
347 // extract the psi values between two samples
348 m_psi.get_inter_sampled_values(lower_sb, p);
349 p = m_psi_buf.data();
350 uint64_t smpl = m_psi.sample(lower_sb);
351 // handle border cases
352 if (lower_b + m_psi.get_sample_dens() >= C[cc + 1])
353 m_psi_buf[C[cc + 1] - lower_b] = size() - smpl;
354 else
355 m_psi_buf[m_psi.get_sample_dens()] = size() - smpl;
356 // search the result linear
357 while ((*p++) + smpl < i)
358 ;
359
360 return p - 1 - m_psi_buf.data() + lower_b - C[cc];
361 }
362 else
363 { // lower_b == (m_C[cc]+sd-1)/sd and lower_sb < upper_sb
364 if (m_psi.sample(lower_sb) >= i)
365 {
366 lower_b = C[cc];
367 upper_b = lower_sb * sd + 1;
368 }
369 else
370 {
371 lower_b = lower_sb * sd;
372 upper_b = std::min(upper_sb * sd, C[cc + 1]);
373 }
374 }
375 finish:
376 // binary search the interval [C[cc]..C[cc+1]-1] for the result
377 // size_type lower_b = m_C[cc], upper_b = m_C[cc+1]; // lower_b inclusive, upper_b exclusive
378 while (lower_b + 1 < upper_b)
379 {
380 size_type mid = (lower_b + upper_b) / 2;
381 if (m_psi[mid] >= i)
382 upper_b = mid;
383 else
384 lower_b = mid;
385 }
386 if (lower_b > C[cc])
387 return lower_b - C[cc] + 1;
388 else
389 { // lower_b == m_C[cc]
390 return m_psi[lower_b] < i; // 1 if m_psi[lower_b]<i, 0 otherwise
391 }
392 }
393
394 // Calculates the position of the i-th c in the BWT of the original text.
395 /*
396 * \param i The i-th occurrence. \f$i\in [1..rank_bwt(size(),c)]\f$.
397 * \param c Symbol c.
398 * \returns The position of the i-th c in the BWT or size() if c does occur less then i times.
399 * \par Time complexity
400 * \f$ \Order{t_{\Psi}} \f$
401 */
402 size_type select_bwt(size_type i, const char_type c) const
403 {
404 assert(i > 0);
406 if (cc == 0 and c != 0) // character is not in the text => return 0
407 return size();
408 assert(cc != 255);
409 if (C[cc] + i - 1 < C[cc + 1])
410 {
411 return m_psi[C[cc] + i - 1];
412 }
413 else
414 return size();
415 }
416};
417
418// == template functions ==
419
420template <class t_enc_vec,
421 uint32_t t_dens,
422 uint32_t t_inv_dens,
423 class t_sa_sample_strat,
424 class t_isa,
425 class t_alphabet_strat>
427{
428 create_buffer();
430 {
431 return;
432 }
433 size_type n = 0;
434 {
437 n = bwt_buf.size();
438 auto event = memory_monitor::event("construct csa-alpbabet");
439 m_alphabet = alphabet_type(bwt_buf, n);
440 }
441 {
442 auto event = memory_monitor::event("sample SA");
443 m_sa_sample = sa_sample_type(config);
444 }
445 {
446 auto event = memory_monitor::event("sample ISA");
447 isa_sample_type isa_s(config, &m_sa_sample);
448 util::swap_support(m_isa_sample, isa_s, &m_sa_sample, (sa_sample_type const *)nullptr);
449 }
450 // if ( config.delete_files ) {
451 // remove_from_cache<int_vector<>>(conf::KEY_SA, config);
452 // }
453
454 int_vector<> cnt_chr(sigma, 0, bits::hi(n) + 1);
455 for (typename alphabet_type::sigma_type i = 0; i < sigma; ++i)
456 {
457 cnt_chr[i] = C[i];
458 }
459 // calculate psi
460 {
461 auto event = memory_monitor::event("construct PSI");
464 std::string psi_file = cache_file_name(conf::KEY_PSI, config);
465 auto psi = write_out_mapper<>::create(psi_file, n, bits::hi(n) + 1);
466 for (size_type i = 0; i < n; ++i)
467 {
468 psi[cnt_chr[char2comp[bwt_buf[i]]]++] = i;
469 }
471 }
472 {
473 auto event = memory_monitor::event("encode PSI");
475 m_psi = t_enc_vec(psi_buf);
476 }
477}
478
479template <class t_enc_vec,
480 uint32_t t_dens,
481 uint32_t t_inv_dens,
482 class t_sa_sample_strat,
483 class t_isa,
484 class t_alphabet_strat>
485inline auto
487 -> value_type
488{
489 size_type off = 0;
490 while (!m_sa_sample.is_sampled(i))
491 { // while i mod t_dens != 0 (SA[i] is not sampled)
492 i = psi[i]; // go to the position where SA[i]+1 is located
493 ++off; // add 1 to the offset
494 }
495 value_type result = m_sa_sample[i];
496 if (result < off)
497 {
498 return m_psi.size() - (off - result);
499 }
500 else
501 return result - off;
502}
503
504template <class t_enc_vec,
505 uint32_t t_dens,
506 uint32_t t_inv_dens,
507 class t_sa_sample_strat,
508 class t_isa,
509 class t_alphabet_strat>
511 std::ostream & out,
513 std::string name) const -> size_type
514{
515 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
516 size_type written_bytes = 0;
517 written_bytes += m_psi.serialize(out, child, "psi");
518 written_bytes += m_sa_sample.serialize(out, child, "sa_samples");
519 written_bytes += m_isa_sample.serialize(out, child, "isa_samples");
520 written_bytes += m_alphabet.serialize(out, child, "alphabet");
521 structure_tree::add_size(child, written_bytes);
522 return written_bytes;
523}
524
525template <class t_enc_vec,
526 uint32_t t_dens,
527 uint32_t t_inv_dens,
528 class t_sa_sample_strat,
529 class t_isa,
530 class t_alphabet_strat>
532{
533 m_psi.load(in);
534 m_sa_sample.load(in);
535 m_isa_sample.load(in, &m_sa_sample);
536 m_alphabet.load(in);
537}
538
539template <class t_enc_vec,
540 uint32_t t_dens,
541 uint32_t t_inv_dens,
542 class t_sa_sample_strat,
543 class t_isa,
544 class t_alphabet_strat>
545template <typename archive_t>
547 archive_t & ar) const
548{
549 ar(CEREAL_NVP(m_psi));
550 ar(CEREAL_NVP(m_sa_sample));
551 ar(CEREAL_NVP(m_isa_sample));
552 ar(CEREAL_NVP(m_alphabet));
553}
554
555template <class t_enc_vec,
556 uint32_t t_dens,
557 uint32_t t_inv_dens,
558 class t_sa_sample_strat,
559 class t_isa,
560 class t_alphabet_strat>
561template <typename archive_t>
563 archive_t & ar)
564{
565 ar(CEREAL_NVP(m_psi));
566 ar(CEREAL_NVP(m_sa_sample));
567 ar(CEREAL_NVP(m_isa_sample));
568 m_isa_sample.set_vector(&m_sa_sample);
569 ar(CEREAL_NVP(m_alphabet));
570}
571
572} // end namespace sdsl
573#endif
bits.hpp contains the sdsl::bits class.
cereal.hpp offers cereal support
#define CEREAL_NVP(X)
Definition cereal.hpp:31
#define CEREAL_LOAD_FUNCTION_NAME
Definition cereal.hpp:34
#define CEREAL_SAVE_FUNCTION_NAME
Definition cereal.hpp:35
A wrapper for the bwt of a compressed suffix array that is based on the function.
A class for the Compressed Suffix Array (CSA) proposed by Sadakane for practical implementation.
Definition csa_sada.hpp:59
t_isa_sample_strat::template type< csa_sada > isa_sample_type
Definition csa_sada.hpp:96
alphabet_type::string_type string_type
Definition csa_sada.hpp:101
uint32_t get_sample_dens() const
Definition csa_sada.hpp:294
alphabet_type::alphabet_category alphabet_category
Definition csa_sada.hpp:98
csa_tag index_category
Definition csa_sada.hpp:104
isa_of_csa_psi< csa_sada > isa_type
Definition csa_sada.hpp:92
bwt_of_csa_psi< csa_sada > bwt_type
Definition csa_sada.hpp:91
const alphabet_type::comp2char_type & comp2char
Definition csa_sada.hpp:130
const first_row_type F
Definition csa_sada.hpp:138
const bwt_type bwt
Definition csa_sada.hpp:135
t_enc_vec enc_vector_type
Definition csa_sada.hpp:88
const_iterator begin() const
Returns a const_iterator to the first element.
Definition csa_sada.hpp:209
const_reference reference
Definition csa_sada.hpp:82
const isa_type isa
Definition csa_sada.hpp:136
csa_sada & operator=(csa_sada &&csa)
Assignment Move Operator.
Definition csa_sada.hpp:250
csa_sada csa_type
Definition csa_sada.hpp:102
t_sa_sample_strat::template type< csa_sada > sa_sample_type
Definition csa_sada.hpp:95
const bwt_type L
Definition csa_sada.hpp:137
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
Definition csa_sada.hpp:510
bool operator!=(csa_sada const &other) const noexcept
Inequality operator.
Definition csa_sada.hpp:272
alphabet_type::comp_char_type comp_char_type
Definition csa_sada.hpp:99
bool empty() const
Returns if the data strucutre is empty.
Definition csa_sada.hpp:200
static size_type max_size()
Returns the largest size that csa_sada can ever have.
Definition csa_sada.hpp:191
sa_sample_type const & sa_sample
Definition csa_sada.hpp:140
static const uint32_t linear_decode_limit
Definition csa_sada.hpp:110
csa_sada(csa_sada &&csa)
Move constructor.
Definition csa_sada.hpp:164
value_type operator[](size_type i) const
[]-operator
Definition csa_sada.hpp:486
const_reference * pointer
Definition csa_sada.hpp:83
const_iterator iterator
Definition csa_sada.hpp:80
bool operator==(csa_sada const &other) const noexcept
Equality operator.
Definition csa_sada.hpp:265
uint64_t value_type
Definition csa_sada.hpp:78
t_alphabet_strat alphabet_type
Definition csa_sada.hpp:97
void load(std::istream &in)
Load from a stream.
Definition csa_sada.hpp:531
const lf_type lf
Definition csa_sada.hpp:134
~csa_sada()
Default Destructor.
Definition csa_sada.hpp:149
text_of_csa< csa_sada > text_type
Definition csa_sada.hpp:93
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Definition csa_sada.hpp:546
first_row_of_csa< csa_sada > first_row_type
Definition csa_sada.hpp:94
isa_sample_type const & isa_sample
Definition csa_sada.hpp:141
int_vector ::size_type size_type
Definition csa_sada.hpp:85
size_type size() const
Number of elements in the .
Definition csa_sada.hpp:182
alphabet_type::char_type char_type
Definition csa_sada.hpp:100
ptrdiff_t difference_type
Definition csa_sada.hpp:87
csa_sada & operator=(csa_sada const &csa)
Assignment Copy Operator.
Definition csa_sada.hpp:236
const pointer const_pointer
Definition csa_sada.hpp:84
random_access_const_iterator< csa_sada > const_iterator
Definition csa_sada.hpp:79
enc_vector_type psi_type
Definition csa_sada.hpp:89
const text_type text
Definition csa_sada.hpp:139
const alphabet_type::C_type & C
Definition csa_sada.hpp:131
const alphabet_type::char2comp_type & char2comp
Definition csa_sada.hpp:129
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition csa_sada.hpp:218
traverse_csa_psi< csa_sada, false > lf_type
Definition csa_sada.hpp:90
size_type csa_size_type
Definition csa_sada.hpp:86
psi_type const & psi
Definition csa_sada.hpp:133
const value_type const_reference
Definition csa_sada.hpp:81
const alphabet_type::sigma_type & sigma
Definition csa_sada.hpp:132
psi_tag extract_category
Definition csa_sada.hpp:105
csa_sada(csa_sada const &csa)
Copy constructor.
Definition csa_sada.hpp:153
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Definition csa_sada.hpp:562
csa_sada()
Default Constructor.
Definition csa_sada.hpp:144
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition io.hpp:36
int_vector_size_type size_type
static mm_event_proxy event(std::string const &name)
Generic iterator for a random access container.
Definition iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, std::string const &name, std::string const &type)
static void add_size(structure_tree_node *v, uint64_t value)
static int_vector_mapper< t_width > create(std::string const &key, cache_config &config)
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
enc_vector.hpp contains the sdsl::enc_vector class.
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
iterators.hpp contains an generic iterator for random access containers.
memory_tracking.hpp contains two function for allocating and deallocating memory
constexpr char KEY_PSI[]
Definition config.hpp:42
Namespace for the succinct data structure library.
bool cache_file_exists(std::string const &key, cache_config const &config)
Checks if the resource specified by the key exists in the cache.
Definition io.hpp:733
std::string cache_file_name(std::string const &key, cache_config const &config)
Returns the file name of the resource.
Definition io.hpp:688
void register_cache_file(std::string const &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition io.hpp:717
constexpr char const * key_bwt()
Contains declarations and definitions of data structure concepts.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:653
Helper class for construction process.
Definition config.hpp:66
structure_tree.hpp contains a helper class which can represent the memory structure of a class.
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.