Matrix Science header

ms_mascotresprotein.hpp

00001 /*
00002 ##############################################################################
00003 # file: ms_mascotresprotein.hpp                                              #
00004 # 'msparser' toolkit                                                         #
00005 # Encapsulates a protein - either for protein summary or peptide summary     #
00006 ##############################################################################
00007 # COPYRIGHT NOTICE                                                           #
00008 # Copyright 1998-2002 Matrix Science Limited  All Rights Reserved.           #
00009 #                                                                            #
00010 ##############################################################################
00011 #    $Archive:: /Mowse/ms_mascotresfile/include/ms_mascotresprotein.hpp    $ #
00012 #     $Author: francoisr $ #
00013 #       $Date: 2017/03/06 09:59:18 $ #
00014 #   $Revision: 1.66 $ #
00015 # $NoKeywords::                                                            $ #
00016 ##############################################################################
00017 */
00018 
00019 #ifndef MS_MASCOTRESPROTEIN_HPP
00020 #define MS_MASCOTRESPROTEIN_HPP
00021 // MS_MASCOTRESFILE_API: __declspec(dllexport) or __declspec(dllimport) for Windows DLL builds, empty otherwise.
00022 #ifdef _WIN32
00023 #pragma warning(disable:4251)   // Don't want all classes to be exported
00024 #pragma warning(disable:4786)   // Debug symbols too long
00025 #   ifndef _MATRIX_USE_STATIC_LIB
00026 #       ifdef MS_MASCOTRESFILE_EXPORTS
00027 #           define MS_MASCOTRESFILE_API __declspec(dllexport)   // MS_MASCOTRESFILE_EXPORTS defined - presumably when building the DLL itself
00028 #       else
00029 #           define MS_MASCOTRESFILE_API __declspec(dllimport)   // any other Windows translation unit imports the symbols
00030 #       endif
00031 #   else
00032 #       define MS_MASCOTRESFILE_API                             // static library on Windows: no decoration
00033 #   endif
00034 #else
00035 #   define MS_MASCOTRESFILE_API                                 // non-Windows platforms: no decoration
00036 #endif
00037 
00038 // for the sake of #include <string>
00039 #ifdef __ALPHA_UNIX__
00040 #include <ctype.h>
00041 #endif
00042 
00043 // Includes from the standard template library
00044 #include <string>
00045 #include <list>
00046 #include <vector>
00047 #include <set>
00048 #include <map>
00049 
00050 
00051 namespace matrix_science {
00052     // Forward declarations: these types are only used below as pointer/reference parameters or as a friend.
00053     class ms_mascotresults;
00054     class ms_proteinsummary;
00055     class ms_pepinfoSortByScore;
00056     // NOTE(review): ms_peptide and ms_mascotresfile are also used below but are neither declared nor #included in this header - presumably the including file must provide them first; confirm.
00062     // Encapsulates a protein - either for protein summary or peptide summary
00063     // (per the file banner). The class is exported through MS_MASCOTRESFILE_API.
00069     class MS_MASCOTRESFILE_API ms_protein
00070     {
00071         public:
00073             // Grouping classification of a protein; read with getGrouping(), written with setGrouping().
00086             enum GROUP 
00087             { 
00088                 GROUP_UNKNOWN,  // = 0
00089                 GROUP_NO,       // = 1
00090                 GROUP_SUBSET,   // = 2
00091                 GROUP_COMPLETE, // = 3
00092                 GROUP_FAMILY    // = 4
00093             };
00094 // A macro named DUPLICATE (defined by any earlier header) would clash with the enum declared below, so it is removed here.
00095 #ifdef DUPLICATE
00096     #ifdef _WIN32
00097 //        #pragma message("WARNING: The identifier 'DUPLICATE' was defined but is incompatible with the definition for ms_protein")
00098     #endif
00099     #undef DUPLICATE
00100 #endif
00101 
00103             // Duplicate classification of a peptide match; this is the return type of getPeptideDuplicate().
00110             enum DUPLICATE 
00111             { 
00112                 DUPE_NotDuplicate,           // = 0
00113                 DUPE_Duplicate,              // = 1
00114                 DUPE_DuplicateSameQuery,     // = 2
00115                 DUPE_HighestScoringDuplicate, // = 3
00116                 DUPE_Ignored                 // = 4
00117             };
00118 
00120             // Bit flags for the 'flags' argument of getMasses(); each value occupies its own hex digit.
00129             enum MASS_FLAGS
00130             {
00131                 MASS_NON_SELECT_NON_MATCH   = 0x0001, 
00132                 MASS_SELECT_NON_MATCH       = 0x0010, 
00133                 MASS_NON_SELECT_MATCH       = 0x0100, 
00134                 MASS_SELECT_MATCH           = 0x1000  
00135             };
00136 
00138             // Single-bit flags accepted by getNumDistinctPeptides(), getNumDistinctPeptideRepeats() and getDistinctPeptide().
00199             enum DISTINCT_PEPTIDE_FLAGS
00200             {
00201                 DPF_SEQUENCE        = 0x0001,  
00202                 DPF_CHARGE          = 0x0002,  
00203                 DPF_MODS            = 0x0004,  
00204                 DPF_UNIQUE          = 0x0008,  
00205                 DPF_NODUPSAMEQUERY  = 0x0010   
00206             };
00207 
00208             // Types for uniquely identifying a protein
00209             typedef std::pair<int, std::string> dbIdxPlusAcc_t;     // (int, string) - presumably (database index, accession)
00210             typedef std::vector<dbIdxPlusAcc_t> dbIdxPlusAccVect_t; // ordered list of such keys
00211             typedef std::set<dbIdxPlusAcc_t>    dbIdxPlusAccSet_t;  // sorted, duplicate-free set of such keys
00212 
00213             // Constructor. proteinSummaryHit defaults to 0; operator< treats a non-zero value as the ms_proteinsummary case.
00215             ms_protein(const double score,
00216                        const std::string accession,
00217                        const bool updateScoreFromPepScores,
00218                        const int  proteinSummaryHit = 0);
00219             // Copy constructor.
00221             ms_protein(const ms_protein& src);
00222             // Destructor (non-virtual).
00224             ~ms_protein();
00225 
00226 #ifndef SWIG
00227             // Copy assignment - excluded when the header is processed by SWIG.
00228             ms_protein& operator=(const ms_protein& right);
00229 #endif
00230             // Pointer-taking counterpart of the copy operations - presumably copies *src into this object (implementation not in this header).
00232             void copyFrom(const ms_protein* src);
00233             // Returns the accession by value; getAccessionStr() below gives non-copying access to accession_.
00235             std::string getAccession() const;
00236             // Database index getter/setter (dbIdx_ is the first key compared by operator<).
00238             int getDB() const;
00239 
00241             void setDB(int dbIdx);
00242             // Score accessors - presumably backed by score_, nonMudPITScore_ and scoreWithET_ respectively.
00244             double getScore()          const;
00245 
00247             double getNonMudpitScore()     const;
00248 
00250             double getScoreWithET() const;
00251             // Peptide counts; aboveThreshold defaults to false.
00253             int    getNumPeptides()    const;
00254 
00256             int    getNumDisplayPeptides(bool aboveThreshold = false)    const;
00257             // Returns the grouping classification (see enum GROUP).
00259             GROUP getGrouping() const;
00260 
00261 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00262             // Members in this section are hidden from the generated documentation. setGrouping() simply stores g in group_.
00263             void setGrouping(GROUP g)         { group_ = g;       }
00264             // Cache (de)serialisation helpers; the note above PEPINFO says they may need changing whenever that struct changes.
00266             std::string getForCache(dbIdxPlusAccVect_t & supersetProteinsUnsorted,
00267                                     dbIdxPlusAccVect_t & components) const;
00268 
00270             bool setFromCache(const std::string & str, ms_mascotresults & results,
00271                               const dbIdxPlusAccVect_t & supersetProteinsUnsorted,
00272                               const dbIdxPlusAccVect_t & components,
00273                               const std::string & cdbFeatures);
00274             // Ignored (query, p) pairs returned as a vector of pairs; a two-vector overload follows the #endif.
00276             std::vector<std::pair<int, int> > getIgnoredQPs() const;
00277             // Tests whether the (q, p) pair is an ignored one.
00279             bool isIgnoredQP(const int q, const int p) const;
00280 #endif
00281             // Overload with two output vectors - presumably the same data as the vector-of-pairs overload; confirm.
00283             void getIgnoredQPs(std::vector<int> &q, std::vector<int> &p) const;
00284             // Per-peptide accessors: pepNumber selects one of this protein's peptide matches (index base is not stated in this header).
00286             int    getPeptideQuery         (const int   pepNumber) const;
00287 
00289             int    getPeptideP             (const int   pepNumber) const;
00290             // Takes a (q, p) pair and returns an int - presumably the matching pepNumber; confirm.
00292             int getPepNumber(const int q, const int p) const;
00293 
00295             int    getPeptideFrame         (const int   pepNumber) const;
00296 
00298             long   getPeptideStart         (const int   pepNumber) const;
00299 
00301             long   getPeptideEnd           (const int   pepNumber) const;
00302 
00304             long   getPeptideMultiplicity  (const int   pepNumber) const;
00305 
00307             DUPLICATE getPeptideDuplicate  (const int   pepNumber) const;
00308 
00310             double getPeptideIonsScore     (const int   pepNumber) const;
00311             
00313             bool   getPeptideIsBold        (const int   pepNumber) const;
00314             // No value argument - presumably sets the 'bold' flag to true for that peptide.
00316             void   setPeptideIsBold        (const int   pepNumber);
00317 
00319             bool   getPeptideShowCheckbox  (const int   pepNumber) const;
00320             // No value argument - presumably sets the 'checkBox' flag to true for that peptide.
00322             void   setPeptideShowCheckbox  (const int   pepNumber);
00323             
00325             int    getPeptideComponentID   (const int   pepNumber) const;
00326             
00328             char   getPeptideResidueBefore (const int   pepNumber) const;
00329 
00331             char   getPeptideResidueAfter  (const int   pepNumber) const;
00332             // "Similar protein" relationship helpers; groupByQueryNumber defaults to false. Note isASimilarProtein() is non-const.
00334             bool isASimilarProtein(const ms_protein       * prot, 
00335                                    const ms_mascotresults * results,
00336                                    const bool groupByQueryNumber = false);
00337 
00339             std::string getSimilarProteinName() const;
00340 
00342             int getSimilarProteinDB() const;
00343 
00345             bool isSimilarProtein(const std::string & acc, const int dbIdx) const;
00346             // Fills the two output vectors; the meaning of the int return value is not visible here.
00348             int getSimilarProteins(std::vector<std::string> & accessions, std::vector<int> & dbIdxs) const;
00349 
00351             void setSimilarProtein(const ms_protein * prot);
00352             // Adds one peptide match. NOTE(review): uses ms_peptide::SEARCH_PHASE, so ms_peptide must be a complete type before this header is included.
00355             void addOnePeptide(      ms_mascotresults & results,
00356                                const int frame,
00357                                const long start, const long end, 
00358                                const long multiplicity,
00359                                const int q, const int p,
00360                                const double correctedScore,
00361                                const double uncorrectedScore,
00362                                const char residueBefore,
00363                                const char residueAfter,
00364                                const ms_protein * component,
00365                                const ms_peptide::SEARCH_PHASE searchPhase,
00366                                const bool isIgnored);
00367 
00368 
00370             long getCoverage() const;
00371 
00373             bool anyMatchToQuery(const int query) const;
00374 
00376             bool anyMatchToQueryAndP(const int query, const int P) const;
00377             // Returns a string built from the results file; numDecimalPlaces defaults to 2.
00379             std::string getUnmatchedMasses(ms_mascotresfile & resfile,
00380                                            const int numDecimalPlaces = 2) const;
00381             // As above, with a selection given by 'flags' (MASS_FLAGS bits, default MASS_SELECT_MATCH).
00383             std::string getMasses(ms_mascotresfile & resfile,
00384                                   const ms_proteinsummary & summary,
00385                                   const unsigned int flags = MASS_SELECT_MATCH,
00386                                   const int numDecimalPlaces = 2) const;
00387             // Also used by operator< as the final tie-break in the protein summary case.
00389             int getFrame() const;
00390 
00392             bool anyBoldRedPeptides(const ms_mascotresults & results) const;
00393 
00395             bool isUnigene() const;
00396             // No argument - presumably marks this protein as a unigene entry; confirm.
00398             void setIsUnigeneEntry();
00399 
00401             bool isPMFMixture() const;
00402             // No argument - presumably marks this protein as a PMF mixture; confirm.
00404             void setIsPMFMixture();
00405             // Sorts the peptide list; every keepAlive* argument has a default (false, 0, "", 0).
00407             void sortPeptides(const ms_mascotresults & results, bool keepAlive = false, int keepAlivePercent = 0, const char * keepAliveAccession = "", int keepAliveCount = 0);
00408             // Component proteins - see the comment on components_ in the private section.
00410             int getNumComponents() const;
00411 
00413             const ms_protein * getComponent(const int componentNumber) const;
00414 
00416             int getProteinSummaryHit() const;
00417 
00419             double getRMSDeltas(const ms_mascotresults & results) const;
00420 
00422             int getHitNumber() const;
00423             // Inline setter: stores hit in hitNum_.
00425             void setHitNumber(const int hit) { hitNum_ = hit;}
00426 
00428             int getMemberNumber() const;
00429 
00431             int getLongestPeptideLen() const;
00432             // Distinct-peptide queries. Per the inline comments below, distinctIndex is 1..getNumDistinctPeptides and repeatIndex is 1..getNumDistinctPeptideRepeats.
00434             int getNumDistinctPeptides(bool aboveThreshold = false,
00435                                        DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
00436 
00438             int getNumDistinctPeptideRepeats(
00439                     int distinctIndex, // 1..getNumDistinctPeptides
00440                     bool aboveThreshold = false,
00441                     DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
00442 
00444             ms_peptide getDistinctPeptide(
00445                     int distinctIndex, // 1..getNumDistinctPeptides
00446                     int repeatIndex = 1, // 1..getNumDistinctPeptideRepeats
00447                     bool aboveThreshold = false,
00448                     DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
00449 
00451             int getLongestSigPeptideLen() const;
00452 
00454             int getNumObservedForEmPAI() const;
00455             // Ordering: database index (dbIdx_) first, then accession_. Only when lhs.proteinSummaryHit_ is non-zero
00457             // are equal accessions further ordered by getFrame(). Note that only the left operand's proteinSummaryHit_ is inspected.
00466             friend inline bool operator<(const ms_protein & lhs, const ms_protein & rhs) { 
00467                 if (lhs.dbIdx_ == rhs.dbIdx_) {
00468                     if ( lhs.proteinSummaryHit_ == 0 ) {
00469                         return lhs.accession_ < rhs.accession_;
00470                     } else { // i.e. ms_proteinsummary - see parser bug 493
00471                         if ( lhs.accession_ == rhs.accession_) {
00472                             return lhs.getFrame() < rhs.getFrame();
00473                         } else {
00474                             return lhs.accession_ < rhs.accession_;
00475                         }
00476                     }
00477                 } else {
00478                     return lhs.dbIdx_ < rhs.dbIdx_; 
00479                 }
00480             }
00481 
00482             // Undocumented function for fast access
00483             const char * getAccessionStr() const { return accession_.c_str(); } // pointer into accession_: only valid while that string is unchanged and this object is alive
00484 
00485         private:
00486             // For each peptide, we have frame, start, end multiplicity
00487             // and we want to just have a reference to the peptide
00488             // structures - using query and 'p', where p is 1..10
00489             // or p=1..50 for PMF or 1..128 (ish) for PMF mixtures.
00490             // See bug 11305 for mistake with number of bits for 'p'
00491             //
00492             // If you change this, then check to see if you need to 
00493             // change getForCache() and setFromCache()
00494             typedef struct
00495             {
00496                 double      ionsScore;
00497                 double      uncorScore;
00498                 double      sortingScore;
00499                 int         start;
00500                 int         end;
00501                 long        multiplicity;
00502                 int         query;
00503                 unsigned int p:8;           // 8 unsigned bits: 0..255
00504                 int         frame:5;        // 5-bit signed field
00505                 int         componentID;
00506                 DUPLICATE   duplicate:3;    // NOTE(review): DUPE_Ignored (4) needs all 3 bits - confirm no target compiler treats this enum bit-field as signed
00507                 bool        ignored:1;
00508                 short       dupeStatus:4;
00509                 bool        bold:1;
00510                 bool        checkBox:1;
00511                 unsigned int searchPhase:2; // 2 unsigned bits: 0..3
00512                 DUPLICATE   nonETduplicate:4;
00513                 short       nonETdupeStatus:4;
00514                 char        residueBefore; // :8;
00515                 char        residueAfter; //:8;
00516             } PEPINFO;
00517             // const member function; the distinct-peptide members below are 'mutable' so const code can update them.
00518             void initialiseDistinctPeptideTree(
00519                     bool  aboveThreshold,
00520                     DISTINCT_PEPTIDE_FLAGS flags) const;
00521 
00522 
00523             // --- Start of uncached variables
00524             mutable std::vector<PEPINFO *> peptides_; // sort by query
00525             mutable std::vector<PEPINFO *> ignoredPeptides_;
00526             mutable std::vector<PEPINFO> allPeptides_; // PEPINFO objects held by value - presumably what the two pointer vectors above point into; confirm
00527 
00528             ms_mascotresults * results_;
00529 //          bool loadedFromCache_;
00530             // --- End of uncached variables
00531 
00532             // Start of all cached variables
00533             unsigned char    flags_; // See FL_... not all bits are cached
00534 
00535             int numPeptides_;  // Only used if loadedFromCache_ is true - otherwise peptides_.size();
00536             mutable int numDisplayPeptides_;
00537             mutable int numDisplayPeptidesAboveThresh_;
00538             mutable int numDistinctPeptides_;
00539             mutable int numDistinctPeptidesAboveThresh_;
00540             mutable int numDistinctUniquePeptides_;
00541             mutable int numDistinctUniqPepAboveThresh_;
00542             mutable int lenLongestPeptideAboveThresh_;
00543             mutable int numObservedForEmPAI_;
00544             mutable int frame_;
00545             mutable bool distinctPeptideAboveThreshold_;
00546             mutable DISTINCT_PEPTIDE_FLAGS distinctPeptideFlags_;
00547             mutable std::list<std::list<ms_peptide*> > distinctPeptideTree_;
00548             dbIdxPlusAccSet_t  supersetProteins_;  // This one is filled when loading from cache
00549             dbIdxPlusAccVect_t supersetProteinsUnsorted_;
00550 
00551             // For unigene and PMF mixture, the protein is really a 'pseudo'
00552             // protein, made up from a number of 'real' proteins
00553             dbIdxPlusAccVect_t components_;
00554 
00555             std::string accession_;
00556             int dbIdx_;
00557             double score_;
00558             double nonMudPITScore_;
00559             double scoreWithET_;
00560             GROUP group_;
00561             int proteinSummaryHit_;
00562             int hitNum_;
00563             mutable int memberNum_;
00564             int longestPeptideLen_;         // Useful with minPepLenInPepSummary
00565             mutable long coverage_;
00566 //          bool pmfMixture_;               // True if protein actually originates from a PMF mixture
00567 //          bool sorted_;                   // Sorting the list of peptides is expensive - don't repeat...
00568 //          bool unigene_;                  // For unigene, we need to get the description line from the unigene file
00569 //          bool updateScoreFromPepScores_; // For protein summary, the protein score is calculated by
00570                                             // nph-mascot.exe, and is in the results file. For the
00571                                             // peptide summary, the score is calculated by adding the ions
00572                                             // scores
00573             // NOTE(review): the four booleans commented out above are presumably now bits in flags_ (see isFlagSet/setFlag) - confirm.
00574             // Functions
00575             void copyPeptidePointers(std::vector<PEPINFO *> &pointersTo, const std::vector<PEPINFO *> &pointersFrom, const ms_protein *src);
00576             void checkFromCache(const char * calledBy) const;
00577             void checkQPFromCache(const char * calledBy) const;
00578             bool isFlagSet(unsigned char fl) const { return (flags_ & fl)?true:false; } // true if any bit of fl is set in flags_
00579             void setFlag(unsigned char fl, bool val) { // sets (val == true) or clears (val == false) the bits of fl in flags_
00580                 if (val) {
00581                     flags_ |= fl;
00582                 } else {
00583                     flags_ &= ~fl; 
00584                 }
00585             }
00586 
00587             static bool isVarModStrEmpty(const std::string &str);
00588             // Friends get access to private members; ms_pepinfoSortByScore needs it because its operator() takes the private PEPINFO type.
00589             friend class prot_sort;
00590             friend class ms_pepinfoSortByScore;
00591     };
00592 #ifndef SWIG
00593     // Helper class - don't use from outside library.
00594     // Orders ms_protein pointers by applying ms_protein's operator< to the objects they point to.
00595     class ms_proteinPtrSortByAccession {
00596     public:
00597         bool operator() (const ms_protein * lhs, const ms_protein * rhs) const {
00598             return *lhs < *rhs;
00599         }
00600     };
00601 
00602     // Comparator for ms_protein pointers: the higher getScore() comes first;
00603     // when the two scores compare equal it falls back to ms_protein's operator<.
00604     class ms_proteinPtrSortByScore {
00605     public:
00606         bool operator() (const ms_protein * lhs, const ms_protein * rhs) const {
00607             const double lhsScore = lhs->getScore();
00608             const double rhsScore = rhs->getScore();
00609             return (lhsScore == rhsScore) ? (*lhs < *rhs)
00610                                           : (lhsScore > rhsScore);
00611         }
00612     };
00613 
00614 
00615     // Comparator over ms_protein::PEPINFO pointers, configured by two bools (pair.first, pair.second); operator() is defined out of line.
00616     class ms_pepinfoSortByScore
00617     {
00618     public:
00619         ms_pepinfoSortByScore(std::pair<bool, bool> pairParam)
00620             : removeDiffPos_(pairParam.first), anyLibraryMatches_(pairParam.second) { }
00621         ms_pepinfoSortByScore(const ms_pepinfoSortByScore& other)
00622             : removeDiffPos_(other.removeDiffPos_), anyLibraryMatches_(other.anyLibraryMatches_) { }
00623         ms_pepinfoSortByScore& operator=(const ms_pepinfoSortByScore& other) {
00624             if (this == &other) { return *this; }   // self-assignment: nothing to copy
00625             removeDiffPos_     = other.removeDiffPos_;
00626             anyLibraryMatches_ = other.anyLibraryMatches_;
00627             return *this;
00628         }
00629         bool operator() (const ms_protein::PEPINFO * p1, const ms_protein::PEPINFO * p2) const;
00630     private:
00631         bool removeDiffPos_;
00632         bool anyLibraryMatches_;
00633     };
00634 
00635 #endif
00636  // end of resfile_group
00637 }   // matrix_science namespace
00638 
00639 #endif // MS_MASCOTRESPROTEIN_HPP
00640 
00641 /*------------------------------- End of File -------------------------------*/
00642 
00643 
00644 
00645 
Copyright © 2016 Matrix Science Ltd.  All Rights Reserved. Generated on Fri Jun 2 2017 01:44:51