RDKit
Open-source cheminformatics and machine learning.
FPBReader.h
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef RD_FPBREADER_H_DEC2015
11 #define RD_FPBREADER_H_DEC2015
12 /*! \file FPBReader.h
13 
14  \brief contains a simple class for reading and searching FPB files
15 
16  \b Note that this functionality is experimental and the API may change
17  in future releases.
18 */
19 
20 #include <iostream>
21 #include <fstream>
22 #include <sstream>
23 #include <string>
26 
27 #include <boost/cstdint.hpp>
28 #include <boost/shared_ptr.hpp>
29 #include <boost/shared_array.hpp>
30 
31 namespace RDKit {
32 namespace detail {
33 struct FPBReader_impl;
34 }
35 
36 //! class for reading and searching FPB files
37 /*!
38  basic usage:
39  \code
40  FPBReader reader("foo.fpb");
41  reader.init();
42  boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
43  std::vector<std::pair<double, unsigned int> > nbrs =
44  reader.getTanimotoNeighbors(*ebv.get(), 0.70);
45  \endcode
46 
47  \b Note: this functionality is experimental and the API may change
48  in future releases.
49 
50  <b>Note on thread safety</b>
51  Operations that involve reading from the FPB file are not thread safe.
52  This means that the \c init() method is not thread safe and none of the
53  search operations are thread safe when an \c FPBReader is initialized in
54  \c lazyRead mode.
55 
56 */
57 class FPBReader {
58  public:
60  : dp_istrm(NULL),
61  dp_impl(NULL),
62  df_owner(false),
63  df_init(false),
64  df_lazyRead(false){};
65  //! ctor for reading from a named file
66  /*!
67  \param fname the name of the file to reads
68  \param lazyRead if set to \c false all fingerprints from the file will be read
69  into memory when \c init() is called.
70  */
71  FPBReader(const char *fname, bool lazyRead = false) {
72  _initFromFilename(fname, lazyRead);
73  };
74  //! \overload
75  FPBReader(const std::string &fname, bool lazyRead = false) {
76  _initFromFilename(fname.c_str(), lazyRead);
77  };
78  //! ctor for reading from an open istream
79  /*!
80  \param inStream the stream to read from
81  \param takeOwnership if set, we will take over ownership of the stream pointer
82  \param lazyRead if set to \c false all fingerprints from the file will be read
83  into memory when \c init() is called.
84 
85  Some additional notes:
86  - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
87  tellg() operations.
88 
89  */
90  FPBReader(std::istream *inStream, bool takeOwnership = true,
91  bool lazyRead = false)
92  : dp_istrm(inStream),
93  df_owner(takeOwnership),
94  df_init(false),
95  df_lazyRead(lazyRead){};
97  destroy();
98  if (df_owner) delete dp_istrm;
99  dp_istrm = NULL;
100  df_init = false;
101  };
102 
103  //! Read the data from the file and initialize internal data structures
104  /*!
105  This must be called before most of the other methods of this clases.
106 
107  Some notes:
108  \li if \c lazyRead is not set, all fingerprints will be read into memory. This
109  can require substantial amounts of memory for large files.
110  \li For large files, this can take a long time.
111  \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
112  and delete inStream after calling \c init()
113  */
114  void init();
115  //! returns the requested fingerprint as an \c ExplicitBitVect
116  boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
117  //! returns the requested fingerprint as an array of bytes
118  boost::shared_array<boost::uint8_t> getBytes(unsigned int idx) const;
119 
120  //! returns the id of the requested fingerprint
121  std::string getId(unsigned int idx) const;
122  //! returns the fingerprint and id of the requested fingerprint
123  std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
124  unsigned int idx) const {
125  return std::make_pair(getFP(idx), getId(idx));
126  };
127 
128  //! returns beginning and end indices of fingerprints having on-bit counts
129  //! within the range (including end points)
130  std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
131  unsigned int minCount, unsigned int maxCount);
132 
133  //! returns the number of fingerprints
134  unsigned int length() const;
135  //! returns the number of bits in our fingerprints
136  unsigned int nBits() const;
137 
138  //! returns the tanimoto similarity between the specified fingerprint and the
139  //! provided fingerprint
140  double getTanimoto(unsigned int idx, const boost::uint8_t *bv) const;
141  //! \overload
142  double getTanimoto(unsigned int idx,
143  boost::shared_array<boost::uint8_t> bv) const {
144  return getTanimoto(idx, bv.get());
145  };
146  //! \overload
147  double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
148 
149  //! returns tanimoto neighbors that are within a similarity threshold
150  /*!
151  The result vector of (similarity,index) pairs is sorted in order
152  of decreasing similarity
153 
154  \param bv the query fingerprint
155  \param threshold the minimum similarity to return
156  \param usePopcountScreen if this is true (the default) the popcount of the
157  neighbors will be used to reduce the number of calculations that need
158  to be done
159 
160  */
161  std::vector<std::pair<double, unsigned int> > getTanimotoNeighbors(
162  const boost::uint8_t *bv, double threshold = 0.7,
163  bool usePopcountScreen = true) const;
164  //! \overload
165  std::vector<std::pair<double, unsigned int> > getTanimotoNeighbors(
166  boost::shared_array<boost::uint8_t> bv, double threshold = 0.7,
167  bool usePopcountScreen = true) const {
168  return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);
169  };
170  //! \overload
171  std::vector<std::pair<double, unsigned int> > getTanimotoNeighbors(
172  const ExplicitBitVect &ebv, double threshold = 0.7,
173  bool usePopcountScreen = true) const;
174 
175  //! returns the Tversky similarity between the specified fingerprint and the
176  //! provided fingerprint
177  /*!
178 
179  \param idx the fingerprint to compare to
180  \param bv the query fingerprint
181  \param ca the Tversky a coefficient
182  \param cb the Tversky a coefficient
183 
184  */
185  double getTversky(unsigned int idx, const boost::uint8_t *bv, double ca,
186  double cb) const;
187  //! \overload
188  double getTversky(unsigned int idx, boost::shared_array<boost::uint8_t> bv,
189  double ca, double cb) const {
190  return getTversky(idx, bv.get(), ca, cb);
191  };
192  //! \overload
193  double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
194  double cb) const;
195 
196  //! returns Tversky neighbors that are within a similarity threshold
197  /*!
198  The result vector of (similarity,index) pairs is sorted in order
199  of decreasing similarity
200 
201  \param bv the query fingerprint
202  \param ca the Tversky a coefficient
203  \param cb the Tversky a coefficient
204  \param threshold the minimum similarity to return
205  \param usePopcountScreen if this is true (the default) the popcount of the
206  neighbors will be used to reduce the number of calculations that need
207  to be done
208 
209  */
210  std::vector<std::pair<double, unsigned int> > getTverskyNeighbors(
211  const boost::uint8_t *bv, double ca, double cb, double threshold = 0.7,
212  bool usePopcountScreen = true) const;
213  //! \overload
214  std::vector<std::pair<double, unsigned int> > getTverskyNeighbors(
215  boost::shared_array<boost::uint8_t> bv, double ca, double cb,
216  double threshold = 0.7, bool usePopcountScreen = true) const {
217  return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);
218  };
219  //! \overload
220  std::vector<std::pair<double, unsigned int> > getTverskyNeighbors(
221  const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
222  bool usePopcountScreen = true) const;
223 
224  //! returns indices of all fingerprints that completely contain this one
225  /*! (i.e. where all the bits set in the query are also set in the db
226  molecule)
227  */
228  std::vector<unsigned int> getContainingNeighbors(
229  const boost::uint8_t *bv) const;
230  //! \overload
231  std::vector<unsigned int> getContainingNeighbors(
232  boost::shared_array<boost::uint8_t> bv) const {
233  return getContainingNeighbors(bv.get());
234  };
235  //! \overload
236  std::vector<unsigned int> getContainingNeighbors(
237  const ExplicitBitVect &ebv) const;
238 
239  private:
240  std::istream *dp_istrm;
241  detail::FPBReader_impl *dp_impl; // implementation details
242  bool df_owner;
243  bool df_init;
244  bool df_lazyRead;
245 
246  // disable automatic copy constructors and assignment operators
247  // for this class and its subclasses. They will likely be
248  // carrying around stream pointers and copying those is a recipe
249  // for disaster.
250  FPBReader(const FPBReader &);
251  FPBReader &operator=(const FPBReader &);
252  void destroy();
253  void _initFromFilename(const char *fname, bool lazyRead) {
254  std::istream *tmpStream = static_cast<std::istream *>(
255  new std::ifstream(fname, std::ios_base::binary));
256  if (!tmpStream || (!(*tmpStream)) || (tmpStream->bad())) {
257  std::ostringstream errout;
258  errout << "Bad input file " << fname;
259  throw BadFileException(errout.str());
260  }
261  dp_istrm = tmpStream;
262  df_owner = true;
263  df_init = false;
264  df_lazyRead = lazyRead;
265  }
266 };
267 }
268 #endif
class for reading and searching FPB files
Definition: FPBReader.h:57
double getTanimoto(unsigned int idx, boost::shared_array< boost::uint8_t > bv) const
Definition: FPBReader.h:142
used by various file parsing classes to indicate a bad file
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition: FPBReader.h:90
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< boost::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
Definition: FPBReader.h:214
FPBReader(const std::string &fname, bool lazyRead=false)
Definition: FPBReader.h:75
Definition: types.h:23
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition: FPBReader.h:123
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< boost::uint8_t > bv) const
Definition: FPBReader.h:231
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:28
double getTversky(unsigned int idx, boost::shared_array< boost::uint8_t > bv, double ca, double cb) const
Definition: FPBReader.h:188
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< boost::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
Definition: FPBReader.h:165
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition: FPBReader.h:71
a class for bit vectors that are densely occupied