Commit 1b16e862 authored by Denys Bulavka's avatar Denys Bulavka
Browse files

First commit

parent 33dac76c
# elm_processing
Database description:
The ELM database was downloaded on 22 May 2015 and saved as elm_original_20150522.csv (version history: http://elm.eu.org/infos/news.html). This file can be found in the folder 'database' and contains 210 motifs.
We make the following simplifications:
^(at protein start) - we ignore this
$ (at end of protein) - we ignore this
| OR - take the shorter version
{0,1} - take the shorter version
(A) aminoacid modification - we ignore this
We also ignore the following ones:
*TRG_PTS2 ^.{1,40}R[^P][^P][^P][LIV][^P][^P][HQ][LIF]
*LIG_PCNA_PIPBox_1 ((^.{0,3})|(Q)).[^FHWY][ILM][^P][^FHILVWYP][HFM][FMY]..
This generated a list of 208 motifs. The file we will be working with is "database/elm_input.txt".
Analysis description:
We will take the following approach to the analysis:
Trim start and end positions with more than 10 characters.
We identified motifs that:
1. Have the same biological role
2. Are described as "minor variants"
The file 'database/marked_motifs.txt' has the list of motifs we will not take into account from elm_input_modif.txt.
Software:
Elm_processing:
Compile the code:
cd elm_processing
make
After compilation the binary will be placed in the folder elm_processing/bin.
Generate files for python scripts:
The corresponding input files for python scripts are already generated. But if needed, they can be generated using the following commands from the directory elm_processing/bin
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 6 >> structure_empiric_frequency.txt
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 7 >> structure_theoretic_probabilities.txt
Other available reports:
Number of aminoacids per coordinate:
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 1
Aminoacids per coordinate:
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 2
Number of k discriminant motifs:
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 3 -k NUMBER
Pairs of k discriminant motifs:
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 4 -k NUMBER
For each k, outputs the number of pairs of motifs with k discriminant positions:
./main -d ../../database/elm_input.txt -c -m ../../database/marked_motifs.txt -r 5
Python scripts:
Each python script has an input directory and an output directory. To execute a python script it is enough to be inside the script folder and run 'python main.py', which will generate the output and place it in the output directory. The rule is that the script that has "empiric" in its name should have "structure_empiric_frequency.txt" in its input folder, while the script that has "theoretic" in its name should have "structure_theoretic_probabilities.txt" in its input folder.
Software for processing elm database
\ No newline at end of file
CLV_C14_Caspase3-7 [DSTE][^P][^DEWHFYC]D[GSAN]
CLV_MEL_PAP_1 [ILV]..R[VF][GS].
CLV_NRD_NRD_1 (.RK)|(RR[^KR])
CLV_PCSK_FUR_1 R.[RK]R.
CLV_PCSK_KEX2_1 [KR]R.
CLV_PCSK_PC1ET2_1 KR.
CLV_PCSK_PC7_1 R...[KR]R.
CLV_PCSK_SKI1_1 [RK].[AILMFV][LTKF].
CLV_Separin_Fungi S[IVLMH]E[IVPFMLYAQR]GR.
CLV_Separin_Metazoa E[IMPVL][MLVP]R.
CLV_TASPASE1 Q[MLVI]DG..[DE]
DEG_APCC_DBOX_1 .R..L..[LIVM].
DEG_APCC_KENBOX_2 .KEN.
DEG_APCC_TPR_1 .[ILM]R$
DEG_COP1 [DE][DE]...VP[DE]
DEG_CRL4_CDT2_1 [NQ]{0,1}..[ILMV][ST][DEN][FY][FY].{2,3}[KR]{2,3}[^DE]
DEG_CRL4_CDT2_2 [NQ]{0,1}..[ILMV]T[DEN][HMFY][FMY].{2,3}[KR]{2,3}[^DE]
DEG_MDM2_1 F...W..[LIV]
DEG_Nend_UBRbox_4 ^M{0,1}(C).
DEG_ODPH_VHL_1 [IL]A(P).{6,8}[FLIVM].[FLIVM]
DEG_SCF_COI1_1 ..[RK][RK].SL..F[FLM].[RK]R[HRK].[RK].
DEG_SCF_FBW7_1 [LIVMP].{0,2}(T)P..([ST])
DEG_SCF_FBW7_2 [LIVMP].{0,2}(T)P..E
DEG_SCF_SKP2-CKS1_1 ..[DE].(T)P.K
DEG_SCF_TIR1_1 .[VLIA][VLI]GWPP[VLI]...R.
DEG_SCF_TRCP1_1 D(S)G.{2,3}([ST])
DEG_SIAH_1 .P.A.V.P[^P]
DOC_AGCK_PIF_1 F..[FWY][ST][FY]
DOC_AGCK_PIF_2 F..[FWY][DE][FY]
DOC_AGCK_PIF_3 F..F$
DOC_ANK_TNKS_1 .R..[PGAV][DEIP]G.
DOC_CKS1_1 [MPVLIFWYQ].(T)P..
DOC_CYCLIN_1 [RK].L.{0,1}[FYLIVMP]
DOC_MAPK_1 [KR]{0,2}[KR].{0,2}[KR].{2,4}[ILVM].[ILVF]
DOC_MAPK_2 F.FP
DOC_PIKK_1 [DEN][DEN].{2,3}[ILMVA][DEN][DEN]L
DOC_PP1_RVXF_1 ..[RK].{0,1}[VIL][^P][FW].
DOC_PP1_SILK_1 .[GS]IL[KR][^DE]
DOC_PP2B_1 .P[^P]I[^P][IV][^P]
DOC_PP2B_2 L.[LIVAPM]P
DOC_SPAK_OSR1_1 RF[^P][IV].
DOC_USP7_1 [PA][^P][^FYWIL]S[^P]
DOC_USP7_2 P.E[^P].S[^P]
DOC_WD40_RPTOR_TOS_1 F[EDQS][MILV][ED][MILV]((.{0,1}[ED])|($))
DOC_WW_Pin1_4 ...([ST])P.
LIG_14-3-3_1 R.[^P]([ST])[^P]P
LIG_14-3-3_2 R..[^P]([ST])[IVLM].
LIG_14-3-3_3 [RHK][STALV].([ST]).[PESRDIFTQ]
LIG_Actin_RPEL_3 [IL]..[^P][^P][^P][^P]R.....[IL]..[^P][^P][ILV][ILM]
LIG_Actin_WH2_1 R..[ILVMF][ILMVF][^P][^P][ILVM].{4,7}L(([KR].)|(NK))[VATI]
LIG_Actin_WH2_2 [^R]..((.[ILMVF])|([ILMVF].))[^P][^P][ILVM].{4,7}L(([KR].)|(NK))[VATIGS]
LIG_AP2alpha_1 F.D.F
LIG_AP2alpha_2 DP[FW]
LIG_APCC_Cbox_1 [DE]R[YFH][ILFVM][PAG].R
LIG_APCC_Cbox_2 DR[YFH][ILFVM][PA]..
LIG_AP_GAE_1 [DE][DES][DEGAS]F[SGAD][DEAP][LVIMFD]
LIG_BIR_III_1 ^M{0,1}A.P.
LIG_BIR_III_2 DA.P.
LIG_BIR_III_3 ^M{0,1}A.[AP].
LIG_BIR_III_4 DA.G.
LIG_BRCT_BRCA1_1 .(S)..F
LIG_BRCT_BRCA1_2 .(S)..F.K
LIG_BRCT_MDC1_1 .(S)..Y$
LIG_CaMK_CASK_1 ((SP)|([ED].{0,1}))[IV]W[IVL].R
LIG_CAP-Gly_1 [ED].{0,2}[ED].{0,2}[EDQ].{0,1}[YF]$
LIG_CAP-Gly_2 .W[RK][DE]GCY$
LIG_Clathr_ClatBox_1 L[IVLMF].[IVLMF][DE]
LIG_Clathr_ClatBox_2 .[NP]W[DES].W
LIG_CORNRBOX L[^P]{2,2}[HI]I[^P]{2,2}[IAV][IL]
LIG_CtBP_PxDLS_1 (P[LVIPME][DENS][LM][VASTRG])|(G[LVIPME][DENS][LM][VASTRG]((K)|(.[KR])))
LIG_Dynein_DLC8_1 [^P].[KR].TQT
LIG_EABR_CEP55_1 .A.GPP.{2,3}Y.
LIG_EF_ALG2_ABM_1 P[PG]{0,1}YP.{1,6}Y[QS]{0,1}P
LIG_EF_ALG2_ABM_2 P.P.{0,1}GF
LIG_EH_1 .NPF.
LIG_EH1_1 .[FYH].[IVM][^WFYP][^WFYP][ILM][ILMV].
LIG_eIF4E_1 Y....L[VILMF]
LIG_eIF4E_2 Y.PP.[ILMV]R
LIG_EVH1_1 ([FYWL]P.PP)|([FYWL]PP[ALIVTFY]P)
LIG_EVH1_2 PP..F
LIG_EVH1_3 [FY].[FW].....[LMVIF]P.P[DE]
LIG_FAT_LD_1 [LV][DE][^P][LM][LM][^P][^P]L[^P]
LIG_FHA_1 ..(T)..[ILV].
LIG_FHA_2 ..(T)..[DE].
LIG_GLEBS_BUB3_1 [EN][FYLW][NSQ].EE[ILMVF][^P][LIVMFA]
LIG_GYF [QHR].{0,1}P[PL]PP[GS]H[RH]
LIG_HCF-1_HBM_1 [DE]H.Y
LIG_HOMEOBOX [FY][DEP]WM
LIG_HP1_1 P[MVLIRWY]V[MVLIAS][LM]
LIG_Integrin_isoDGR_1 NGR
LIG_IQ ...[SACLIVTM]..[ILVMFCT]Q.{3,3}[RK].{4,5}[RKQ]..
LIG_KEPE_1 [VILMFT]K.EP.[DE]
LIG_KEPE_2 [VILMFT]K.EP.{2,3}[DE]
LIG_KEPE_3 [VILMFT]K.EP....[DE]
LIG_LIR_Apic_2 [EDST].{0,2}[WFY]..P
LIG_LIR_Gen_1 [EDST].{0,2}[WFY]..[ILV]
LIG_LIR_LC3C_4 [EDST].{0,2}LVV
LIG_LIR_Nem_3 [EDST].{0,2}[WFY]..[ILVFY]
LIG_LYPXL_L_2 [LM]YP...[LI][^P][^P][LI]
LIG_LYPXL_S_1 [LM]YP.[LI]
LIG_MAD2 [KR][IV][LV].....P
LIG_MYND_1 P.L.P
LIG_MYND_2 PP.LI
LIG_MYND_3 [LMV]P.LE
LIG_NBox_RRM_1 F..A[ILV]..A..[ILV]
LIG_NRBOX [^P]L[^P][^P]LL[^P]
LIG_OCRL_FandH_1 .F[^P][^P][KRIL]H[^P][^P][YLMFH][^P]...
LIG_PAM2_1 ..[LFP][NS][PIVTAFL].A..(([FY].[PYLF])|(W..)).
LIG_PAM2_2 ((WPP)|([FL][PV][APQ]))EF.PG.PWKG.
*LIG_PCNA_PIPBox_1 ((^.{0,3})|(Q)).[^FHWY][ILM][^P][^FHILVWYP][HFM][FMY]..
LIG_PDZ_Class_1 ...[ST].[ACVILF]$
LIG_PDZ_Class_2 ...[VLIFY].[ACVILF]$
LIG_PDZ_Class_3 ...[DE].[ACVILF]$
LIG_PTAP_UEV_1 .P[TS]AP.
LIG_PTB_Apo_2 (.[^P].NP.[FY].)|(.[ILVMFY].N..[FY].)
LIG_PTB_Phospho_1 (.[^P].NP.(Y))|(.[ILVMFY].N..(Y))
LIG_Rb_LxCxE_1 [LI].C.[DE]
LIG_Rb_pABgroove_1 ..[LIMV]..[LM][FY]D.
LIG_RGD RGD
LIG_RRM_PRI_1 .[ILVM]LG..P.
LIG_SH2_GRB2 (Y).N.
LIG_SH2_PTP2 (Y)[IV].[VILP]
LIG_SH2_SRC (Y)[QDEVAIL][DENPYHI][IPVGAHS]
LIG_SH2_STAT3 (Y)..Q
LIG_SH2_STAT5 (Y)[VLTFIC]..
LIG_SH2_STAT6 G(Y)[KQ].F
LIG_SH3_1 [RKY]..P..P
LIG_SH3_2 P..P.[KR]
LIG_SH3_3 ...[PV]..P
LIG_SH3_4 KP..[QK]...
LIG_SH3_5 P..DY
LIG_Sin3_1 [LIV]..[LM]L.AA.[FY][LI]
LIG_Sin3_2 [FHYM].A[AV].[VAC]L[MV].[MI]
LIG_Sin3_3 [FA].[LA][LV][LVI]..[AM]
LIG_SPRY_1 [ED][LIV]NNN[^P]
LIG_SUFU_1 [SV][CY]GH[LIF][LAST][GAIV].
LIG_SUMO_SBM_1 [ILV](.[ILV]|[ILV]|[ILV].)[ILV][STDE]{1,10}
LIG_SUMO_SBM_2 [STDE]{1,10}[ILV](.[ILV]|[ILV]|[ILV].)[ILV]
LIG_SxIP_EBH_1 ([KR][^ED]{0,5}[ST].IP[^ED]{5,5})|([^ED]{5,5}[ST].IP[^ED]{0,5}[KR])
LIG_TPR EEVD$
LIG_TRAF2_1 [PSAT].[QE]E
LIG_TRAF2_2 P.Q..D
LIG_TRAF6 ..P.E..[FYWHDE].
LIG_TRFH_1 [FY].L.P
LIG_TYR_ITAM [DEN]..(Y)..[LI].{6,12}(Y)..[LI]
LIG_TYR_ITIM [ILV].(Y)..[ILV]
LIG_TYR_ITSM ..T.(Y)..[IV]
LIG_ULM_U2AF65_1 [KR]{1,4}[KR].[KR]W.
LIG_WD40_WDR5_1 [ED].{0,3}[VI]D[VI]
LIG_WD40_WDR5_2 [EDSTY].{0,4}[VIPLA][TSDEKR][ILVA]
LIG_WD40_WDR5_WIN_1 [HN].[HNST]G[SCA]AR[STAC][EQ][GPVILM][YFHKRQN][YHLIVMATS]
LIG_WD40_WDR5_WIN_2 [HNCSVI]..[GDE][STCA][AGVS]R[STCA][EQR][GPLAV]
LIG_WD40_WDR5_WIN_3 [HNSTE].[TSQN]P{0,1}GS{0,1}[SCA][AFWH][KR][TAS][DEQ][GP][RKYFIVAMW]..[IVM]
LIG_WH1 ES[RK][FY].F[HR][PST][IVLM][DES][DE]
LIG_WRPW_1 [WFY]RP[WFY].{0,7}$
LIG_WRPW_2 [WFY][KR]P[WFY]
LIG_WW_1 PP.Y
LIG_WW_2 PPLP
LIG_WW_3 .PPR.
MOD_ASX_betaOH_EGF C.([DN]).{4,4}[FY].C.C
MOD_CAAXbox (C)[^DENQ][LIVM].$
MOD_CDK_1 ...([ST])P.[KR]
MOD_CK1_1 S..([ST])...
MOD_CK2_1 ...([ST])..E
MOD_CMANNOS (W)..W
MOD_GlcNHglycan [ED]{0,3}.(S)[GA].
MOD_GSK3_1 ...([ST])...[ST]
MOD_LATS_1 H.[KR]..([ST])[^P]
MOD_NEK2_1 [FLM][^P][^P]([ST])[^DEP][^DE]
MOD_N-GLC_1 .(N)[^P][ST]..
MOD_N-GLC_2 (N)[^P]C
MOD_NMyristoyl ^M{0,1}(G)[^EDRKHPFYW]..[STAGCN][^P]
MOD_OFUCOSY C.{3,5}([ST])C
MOD_OGLYCOS C.(S).PC
MOD_PIKK_1 ...([ST])Q..
MOD_PK_1 [RK]..(S)[VI]..
MOD_PKA_1 [RK][RK].([ST])[^P]..
MOD_PKA_2 .R.([ST])[^P]..
MOD_PKB_1 R.R..([ST])[^P]..
MOD_PLK .[DE].([ST])[ILFWMVA]..
MOD_ProDKin_1 ...([ST])P..
MOD_SPalmitoyl_2 G(C)M[GS][CL][KP]C
MOD_SPalmitoyl_4 ^M{0,1}G(C)..S[AKS]
MOD_SUMO [VILMAFP](K).E
MOD_TYR_CSK [TAD][EA].Q(Y)[QE].[GQA][PEDLS]
MOD_TYR_DYR ..[RKTC][IVL]Y[TQHS](Y)[IL]QSR
MOD_WntLipid [ETA](C)[QERK]..F...RWNC[ST]
TRG_AP2beta_CARGO_1 [DE].{1,2}F[^P][^P][FL][^P][^P][^P]R
TRG_Cilium_Arf4_1 QV.P.$
TRG_Cilium_RVxP_2 RV.P.
TRG_ENDOCYTIC_2 Y..[LMVIF]
TRG_ER_diArg_1 ([LIVMFYWPR]R[^YFWDE]{0,1}R)|(R[^YFWDE]{0,1}R[LIVMFYWPR])
TRG_ER_diLys_1 K.{0,1}K.{2,3}$
TRG_ER_FFAT_1 [DE].{0,4}E[FY][FYK]D[AC].[ESTD]
TRG_ER_KDEL_1 [KRHQSAP][DENQT]EL$
TRG_Golgi_diPhe_1 Q.{6,6}FF.{6,7}$
TRG_LysEnd_APsAcLL_1 [DERQ]...L[LVI]
TRG_LysEnd_APsAcLL_3 [DET]E[RK].PL[LI]
TRG_LysEnd_GGAAcLL_1 D..LL.{1,2}$
TRG_LysEnd_GGAAcLL_2 S[LW]LD[DE]EL[LM]
TRG_NES_CRM1_1 ([DEQ].{0,1}[LIM].{2,3}[LIVMF][^P]{2,3}[LMVF].[LMIV].{0,3}[DE])|([DE].{0,1}[LIM].{2,3}[LIVMF][^P]{2,3}[LMVF].[LMIV].{0,3}[DEQ])
TRG_NLS_Bipartite_1 [KR][KR].{7,15}[^DE]((K[RK])|(RK))(([^DE][KR])|([KR][^DE]))[^DE]
TRG_NLS_MonoCore_2 [^DE]((K[RK])|(RK))[KRP][KR][^DE]
TRG_NLS_MonoExtC_3 [^DE]((K[RK])|(RK))(([^DE][KR])|([KR][^DE]))(([PKR])|([^DE][DE]))
TRG_NLS_MonoExtN_4 (([PKR].{0,1}[^DE])|([PKR]))((K[RK])|(RK))(([^DE][KR])|([KR][^DE]))[^DE]
TRG_PEX_1 W...[FY]
TRG_PEX_2 F...[WF]
TRG_PEX_3 L..LL...L..F
TRG_PTS1 (.[SAPTC][KRH][LMFI]$)|([KRH][SAPTC][NTS][LMFI]$)
*TRG_PTS2 ^.{1,40}R[^P][^P][^P][LIV][^P][^P][HQ][LIF]
This diff is collapsed.
CLV_PCSK_KEX2_1
CLV_PCSK_PC1ET2_1
CLV_PCSK_PC7_1
CLV_PCSK_SKI1_1
DEG_CRL4_CDT2_2
DEG_SCF_FBW7_2
DOC_AGCK_PIF_2
DOC_AGCK_PIF_3
LIG_14-3-3_2
LIG_14-3-3_3
LIG_Actin_WH2_1
LIG_APCC_Cbox_1
LIG_BIR_III_1
LIG_BIR_III_2
LIG_BIR_III_4
LIG_BRCT_BRCA1_2
LIG_BRCT_MDC1_1
LIG_EF_ALG2_ABM_2
LIG_eIF4E_2
LIG_KEPE_1
LIG_KEPE_3
LIG_LIR_Apic_2
LIG_LIR_LC3C_4
LIG_LIR_Nem_3
LIG_LYPXL_L_2
LIG_PTB_Phospho_1
LIG_WD40_WDR5_2
LIG_WD40_WDR5_WIN_2
LIG_WD40_WDR5_WIN_3
LIG_WRPW_2
MOD_PKA_2
TRG_Cilium_Arf4_1
TRG_LysEnd_APsAcLL_3
TRG_NLS_Bipartite_1
TRG_NLS_MonoExtC_3
TRG_NLS_MonoExtN_4
#Compiler and Linker
CC := g++

#The Target Binary Program
TARGET := main

#The Directories, Source, Includes, Objects, Binary and Resources
SRCDIR := src
INCDIR := inc
BUILDDIR := obj
TARGETDIR := bin
RESDIR := res
SRCEXT := cpp
DEPEXT := d
OBJEXT := o

#Flags, Libraries and Includes
CFLAGS := -fopenmp -Wall -O3 -g
LIB := -fopenmp -lm
INC := -I$(INCDIR) -I/usr/local/include
INCDEP := -I$(INCDIR)

#---------------------------------------------------------------------------------
#DO NOT EDIT BELOW THIS LINE
#---------------------------------------------------------------------------------
#The -name pattern is quoted so the shell does not expand it against files in
#the current directory before find sees it.
SOURCES := $(shell find $(SRCDIR) -type f -name '*.$(SRCEXT)')
OBJECTS := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.$(OBJEXT)))

#Default Make
all: resources $(TARGET)

#Remake
remake: cleaner all

#Resources: nothing is copied, this target only makes sure the output directories exist
resources: directories

#Make the Directories
directories:
	@mkdir -p $(TARGETDIR)
	@mkdir -p $(BUILDDIR)

#Clean only Objects
clean:
	@$(RM) -rf $(BUILDDIR)

#Full Clean, Objects and Binaries
cleaner: clean
	@$(RM) -rf $(TARGETDIR)

#Pull in dependency info for *existing* .o files
-include $(OBJECTS:.$(OBJEXT)=.$(DEPEXT))

#Link
#The target directory is created here as well, so that the link step works on a
#fresh checkout even when this rule runs before (or without) 'directories'.
$(TARGET): $(OBJECTS)
	@mkdir -p $(TARGETDIR)
	$(CC) -o $(TARGETDIR)/$(TARGET) $^ $(LIB)

#Compile
#Besides the object file, a .d file is written next to it: first the rule
#"object: headers" produced by -MM, then every header as an empty target so that
#a deleted header does not break the build.
$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(SRCEXT)
	@mkdir -p $(dir $@)
	$(CC) $(CFLAGS) $(INC) -c -o $@ $<
	@$(CC) $(CFLAGS) $(INCDEP) -MM $(SRCDIR)/$*.$(SRCEXT) > $(BUILDDIR)/$*.$(DEPEXT)
	@cp -f $(BUILDDIR)/$*.$(DEPEXT) $(BUILDDIR)/$*.$(DEPEXT).tmp
	@sed -e 's|.*:|$(BUILDDIR)/$*.$(OBJEXT):|' < $(BUILDDIR)/$*.$(DEPEXT).tmp > $(BUILDDIR)/$*.$(DEPEXT)
	@sed -e 's/.*://' -e 's/\\$$//' < $(BUILDDIR)/$*.$(DEPEXT).tmp | fmt -1 | sed -e 's/^ *//' -e 's/$$/:/' >> $(BUILDDIR)/$*.$(DEPEXT)
	@rm -f $(BUILDDIR)/$*.$(DEPEXT).tmp

#Non-File Targets
.PHONY: all remake clean cleaner resources directories
#ifndef _DATABASE_
#define _DATABASE_
#include "Header.hpp"
#include "Motif_class.hpp"
/*
Motif database: declares loading a list of motifs from a file and the clean-up
steps (end trimming, removal of marked motifs) applied to it.
*/
class Database
{
private:
    // Alphabet used by this class: 20 distinct one-letter codes.
    std::string all = "VLTKWHFIRMAPGSCNQYDE";
    std::string marked_motifs_filename = "";
    // NOTE(review): presumably returns the letters of `all` that are absent from the argument - confirm in Database.cpp.
    std::string complement(std::string);
    std::vector<Motif_class> motifs_eliminated;
    // Both flags get a default value so that a default-constructed Database
    // never carries (or copies) indeterminate bools.
    bool clean_ends = false;
    bool clean_marked = false;
    /*
    {a,b} always choose a, and for | always take the shortest one.
    */
    std::vector<std::string> proc(std::string);
    /*
    INPUT: database location
    OUTPUT: vector of pairs with first coordinate the name of the linear motif, second coordinate regex.
    NOTE(review): the declaration returns void, so the result is presumably stored in motif_database - confirm.
    */
    void load_motif_database(std::string);
    /*
    Eliminate end characters with more than 10 possibilities. If the list is empty we eliminate it while reporting which are eliminated.
    */
    void clean_end_characters();
    std::vector<Motif_class> motif_database;
    void clean_marked_motifs();
public:
    Database();
    /*
    input: location of the database file, bool selecting the modified (cleaned) version,
    location of the file listing the marked motifs
    */
    Database(std::string database_filename, bool clean_ends, std::string marked_motifs_filename);
    std::vector<Motif_class> get_motif_database();
};
#endif
#ifndef _HEADER_
#define _HEADER_
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <sstream>
#include <deque>
#include <algorithm>
#include <stdio.h>
#include <set>
#include <map>
// forn(i,n): run the following statement with int i = 0, 1, ..., n-1.
// n is wrapped in parentheses so that a compound argument (e.g. `a ? b : c`)
// is converted to int as a whole instead of only its first operand.
#define forn(i,n) for(int i=0;i<(int)(n);i++)
// debug(k): print the text of expression k, a colon and its value to std::cout.
// k is wrapped in parentheses so that operators with lower precedence than <<
// inside k do not break the output chain.
#define debug(k) std::cout <<#k<<":"<<(k)<<std::endl
// Writes the elements of v to out, separated by "," and without a trailing
// separator. An empty vector writes nothing. Returns out to allow chaining.
template<typename T>
std::ostream& operator<< (std::ostream& out, const std::vector<T>& v) {
    auto current = v.begin();
    const auto finish = v.end();
    if (current == finish) {
        return out;
    }
    // First element on its own, then every further element preceded by a comma.
    out << *current;
    for (++current; current != finish; ++current) {
        out << ",";
        out << *current;
    }
    return out;
}
#endif
#ifndef _MDP_TOOLBOX_
#define _MDP_TOOLBOX_
#include "Database.hpp"
#include "Motif_pair.hpp"
// Toolbox around "motif discriminant positions" (mdp) between pairs of motifs.
// Only declarations are visible here; the definitions live in the matching .cpp file.
class Mdp_toolbox
{
private:
//mdp_database[<motif_1, motif_2>] = motif_discriminant_position(motif_1, motif_2)
std::map<Motif_pair, int > mdp_database;
/*
INPUT: string a, string b
OUTPUT: number of letters that a and b have in common.
Assumptions: neither a nor b has repeated letters, and if either of them has more than 10 letters it is treated as if it had all 20.
*/
int intersection(std::string, std::string);
int motif_discriminating_positions(Motif_pair);
public:
Mdp_toolbox();
// NOTE(review): Database's constructor takes (std::string, bool, std::string);
// confirm how the two bools here map onto the database location / cleaning options.
Mdp_toolbox(std::string, bool, bool);
// NOTE(review): presumably the int argument is k, the number of discriminant positions - confirm.
std::vector<Motif_pair> get_pairs_k_discriminant_positions(int);
int get_mdp(Motif_pair);
std::vector<Motif_class> get_k_discriminant_motifs(Motif_class, int);
// Returns a std::map by value, i.e. the caller receives a copy.
std::map<Motif_pair, int> get_mdp_database();
// Public data member: the motif database used by this toolbox.
Database database;
int alignval(Motif_class, Motif_class);
/*
INPUT: two vectors of string a and b of same length.
OUTPUT: number of coordinates i such that a[i] and b[i] intersect.
NOTE(review): the parameters are Motif_class objects, so a and b are presumably their sequences - confirm.
*/
int simple_motif_discriminating_positions(Motif_class, Motif_class);
};
#endif
#ifndef _MOTIF_CLASS_
#define _MOTIF_CLASS_
#include "Header.hpp"
// A named motif class described coordinate by coordinate.
class Motif_class{
private:
//name of the motif class
std::string name;
//sequence[i] = possibilities of amino acids for the i-th coordinate
std::vector<std::string> sequence;
// NOTE(review): the meaning of structure is not visible in this header;
// presumably one integer per coordinate derived from sequence - confirm in Motif_class.cpp.
std::vector<int> structure;
public:
Motif_class();
// NOTE(review): presumably (name, sequence), matching the members above - confirm.
Motif_class(std::string, std::vector<std::string>);
// The getters below return copies of the members.
std::string get_name() const;
std::vector<std::string> get_sequence() const;
// Unlike the other getters this one is not const, so it cannot be called on a const Motif_class.
std::vector<int> get_structure();
// Comparison operators; operator< is what allows Motif_class to be used in ordered containers.
bool operator <(const Motif_class &) const;
bool operator !=(const Motif_class &) const;
bool operator ==(const Motif_class &) const;
};
#endif
#ifndef _MOTIF_PAIR_
#define _MOTIF_PAIR_
#include "Motif_class.hpp"
// Motif_pair exists so that two motifs are always stored as
// <short_motif, long_motif>; this avoids counting the same pair twice.
class Motif_pair{
private:
    Motif_class short_motif;
    Motif_class long_motif;
public:
    Motif_pair(Motif_class, Motif_class);
    // Accessors return copies of the stored motifs.
    Motif_class get_short_motif() const;
    Motif_class get_long_motif() const;
    // Ordering between pairs (needed to use Motif_pair as a std::map key).
    bool operator< (const Motif_pair &) const;
};
#endif
#ifndef _REPORT_MANAGER_
#define _REPORT_MANAGER_
#include "Mdp_toolbox.hpp"
#include "Statistical_toolbox.hpp"
// Produces the textual reports; every report is returned as a std::stringstream.
class Report_manager{
private:
    Mdp_toolbox mdp_toolbox;
    Statistical_toolbox statistical_toolbox;
    // Field separator used in the reports.
    std::string separator = "\t";
public:
    // Arguments: database location; whether endpoints with more than 10
    // characters should be removed; third argument is a std::string
    // (NOTE(review): presumably the marked-motifs file - confirm).
    Report_manager(std::string, bool, std::string);

    // Reports based on mdp_toolbox.
    std::stringstream number_of_aminoacids_per_coordinate();
    std::stringstream aminoacids_per_coordinate();
    std::stringstream number_k_discriminant_motifs(int k);
    std::stringstream k_discriminant_motifs_pair(int k);
    std::stringstream motif_discriminant_positions_alignval();
    std::stringstream number_pairs_motifs_with_k_discriminant_positions();

    // Reports based on statistical_toolbox; these are the ones the python scripts need.
    std::stringstream empiric_motif_probabilities();
    std::stringstream all_lenghts_accumulated_probabilities();
};
#endif
#ifndef _STATISTICAL_TOOLBOX_
#define _STATISTICAL_TOOLBOX_
#include "Header.hpp"
#include "Mdp_toolbox.hpp"
// Statistics over the motif database: sequence/amino-acid counts by length and
// the empiric and theoretic motif probabilities derived from them.
class Statistical_toolbox{
private:
    Mdp_toolbox mdp_toolbox;
    int aminoacid_choices_upper_bound = 20;

    // Counters, keyed by length.
    std::map<int, int> number_sequences_by_length;
    std::map<int, std::map<int, int> > number_aminoacids_by_length;
    std::map<int, int> number_total_aminoacids_by_length;
    int total_number_sequences;
    int length_upper_bound;

    // Results, keyed by a vector<int>.
    std::map<std::vector<int>, float> empiric_motif_probabilities;
    std::map<std::vector<int>, int> empiric_motif_number;
    int total_empiric_motif_number;
    std::map<std::vector<int>, float> theoretic_motif_probabilities;

    void count_sequences();
    void compute_probability();
    void compute_empiric_motif_numbers();
    float compute_empiric_based_probability(std::vector<int> &);
    void compute_empiric_motif_probabilities();
    void compute_theoretic_motif_probabilities();
    void iteration(std::vector<int> &);

    // Factorial of k as unsigned long long (1 for k <= 0).
    // The product wraps around for k > 20, since 21! does not fit in 64 bits.
    unsigned long long stat_fact(int k)
    {
        unsigned long long product = 1;
        for (int step = 0; step < k; ++step) {
            product *= step + 1;
        }
        return product;
    }
public:
    Statistical_toolbox();
    Statistical_toolbox(Mdp_toolbox);
    std::map<std::vector<int>, float> get_empiric_motif_probabilities();
    std::map<std::vector<int>, float> get_theoretic_motif_probabilities();
};
#endif
#include "Database.hpp"
// Default constructor: nothing is loaded. Both cleaning flags are set to false
// explicitly so that a default-constructed Database (and any copy of it) never
// holds indeterminate bool values.
Database::Database() : clean_ends(false), clean_marked(false) {}
Database::Database(std::string database_filename, bool clean_ends, std::string marked_motifs_filename)
{
load_motif_database(database_filename);