% Journal-article records; in this section they run newest first (2021 down to 2008).
% Citation keys are the bare numeric ids found in the source and are kept unchanged.
% Conventions used below:
%   - month uses the standard three-letter macros; where the source date also
%     carried a day of the month, that day is kept in the non-standard day field
%     (ignored by classic styles).
%   - page ranges use a double hyphen and abbreviated end pages are written out.
%   - acronyms and proper nouns in titles are brace-protected against recasing.
% NOTE(review): keywords look like comma-separated subject headings, and some
% headings appear to contain commas themselves (e.g. "Databases, Genetic");
% confirm how they should be split before relying on keyword filtering.

@article{701,
  author   = {Pe{\~n}a-Chilet, Maria and Rold{\'a}n, Gema and Perez-Florido, Javier and Ortuno, Francisco M and Carmona, Rosario and Aquino, Virginia and L{\'o}pez-L{\'o}pez, Daniel and Loucera, Carlos and Fernandez-Rueda, Jose L and Gallego, Asunci{\'o}n and Garcia-Garcia, Francisco and Gonz{\'a}lez-Neira, Anna and Pita, Guillermo and N{\'u}{\~n}ez-Torres, Roc{\'\i}o and Santoyo-L{\'o}pez, Javier and Ayuso, Carmen and Minguez, Pablo and Avila-Fernandez, Almudena and Corton, Marta and Moreno-Pelayo, Miguel {\'A}ngel and Morin, Mat{\'\i}as and Gallego-Martinez, Alvaro and Lopez-Escamez, Jose A and Borrego, Salud and Anti{\~n}olo, Guillermo and Amigo, Jorge and Salgado-Garrido, Josefa and Pasalodos-Sanchez, Sara and Morte, Beatriz and Carracedo, {\'A}ngel and Alonso, {\'A}ngel and Dopazo, Joaquin},
  title    = {{CSVS}, a crowdsourcing database of the {Spanish} population genetic variability},
  journal  = {Nucleic Acids Research},
  volume   = {49},
  year     = {2021},
  month    = jan,
  day      = {8},
  pages    = {D1130--D1137},
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkaa794},
  keywords = {Alleles, Chromosome Mapping, Crowdsourcing, Databases, Genetic, Exome, Gene Frequency, Genetic Variation, Genetics, Population, Genome, Human, Genomics, Humans, Internet, Precision Medicine, Software, Spain},
  abstract = {The knowledge of the genetic variability of the local population is of utmost importance in personalized medicine and has been revealed as a critical factor for the discovery of new disease variants. Here, we present the Collaborative Spanish Variability Server (CSVS), which currently contains more than 2000 genomes and exomes of unrelated Spanish individuals. This database has been generated in a collaborative crowdsourcing effort collecting sequencing data produced by local genomic projects and for other purposes. Sequences have been grouped by ICD10 upper categories. A web interface allows querying the database removing one or more ICD10 categories. In this way, aggregated counts of allele frequencies of the pseudo-control Spanish population can be obtained for diseases belonging to the category removed. Interestingly, in addition to pseudo-control studies, some population studies can be made, as, for example, prevalence of pharmacogenomic variants, etc. In addition, this genomic data has been used to define the first Spanish Genome Reference Panel (SGRP1.0) for imputation. This is the first local repository of variability entirely produced by a crowdsourcing effort and constitutes an example for future initiatives to characterize local variability worldwide. CSVS is also part of the GA4GH Beacon network. CSVS can be accessed at: http://csvs.babelomics.org/.},
}

@article{387,
  author   = {Lopez, Javier and Coll, Jacobo and Haimel, Matthias and Kandasamy, Swaathi and T{\'a}rraga, Joaqu{\'\i}n and Furio-Tari, Pedro and Bari, Wasim and Bleda, Marta and Rueda, Antonio and Gr{\"a}f, Stefan and Rendon, Augusto and Dopazo, Joaquin and Medina, Ignacio},
  title    = {{HGVA}: the {Human Genome Variation Archive}},
  journal  = {Nucleic Acids Research},
  volume   = {45},
  year     = {2017},
  month    = jul,
  day      = {3},
  pages    = {W189--W194},
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkx445},
  url      = {https://academic.oup.com/nar/article-lookup/doi/10.1093/nar/gkx445},
  keywords = {Genetic Variation, Genome, Human, Humans, Internet, Software, User-Computer Interface},
  abstract = {High-profile genomic variation projects like the 1000 Genomes project or the Exome Aggregation Consortium, are generating a wealth of human genomic variation knowledge which can be used as an essential reference for identifying disease-causing genotypes. However, accessing these data, contrasting the various studies and integrating those data in downstream analyses remains cumbersome. The Human Genome Variation Archive (HGVA) tackles these challenges and facilitates access to genomic data for key reference projects in a clean, fast and integrated fashion. HGVA provides an efficient and intuitive web-interface for easy data mining, a comprehensive RESTful API and client libraries in Python, Java and JavaScript for fast programmatic access to its knowledge base. HGVA calculates population frequencies for these projects and enriches their data with variant annotation provided by CellBase, a rich and fast annotation solution. HGVA serves as a proof-of-concept of the genome analysis developments being carried out by the University of Cambridge together with UK{\textquoteright}s 100 000 genomes project and the National Institute for Health Research BioResource Rare-Diseases, in particular, deploying open-source for Computational Biology (OpenCB) software platform for storing and analyzing massive genomic datasets.},
}

@article{388,
  author   = {Carbonell-Caballero, Jos{\'e} and Amadoz, Alicia and Alonso, Roberto and Hidalgo, Marta R and Cubuk, Cankut and Conesa, David and L{\'o}pez-Qu{\'\i}lez, Antonio and Dopazo, Joaquin},
  title    = {Reference genome assessment from a population scale perspective: an accurate profile of variability and noise},
  journal  = {Bioinformatics},
  volume   = {33},
  year     = {2017},
  month    = nov,
  day      = {15},
  pages    = {3511--3517},
  issn     = {1367-4811},
  doi      = {10.1093/bioinformatics/btx482},
  url      = {https://academic.oup.com/bioinformatics/article-lookup/doi/10.1093/bioinformatics/btx482},
  keywords = {Animals, Genetic Variation, Genome, Genomics, Genotype, Humans, Models, Statistical, Quality Control, Reproducibility of Results, Software},
  abstract = {Motivation: Current plant and animal genomic studies are often based on newly assembled genomes that have not been properly consolidated. In this scenario, misassembled regions can easily lead to false-positive findings. Despite quality control scores are included within genotyping protocols, they are usually employed to evaluate individual sample quality rather than reference sequence reliability. We propose a statistical model that combines quality control scores across samples in order to detect incongruent patterns at every genomic region. Our model is inherently robust since common artifact signals are expected to be shared between independent samples over misassembled regions of the genome.

Results: The reliability of our protocol has been extensively tested through different experiments and organisms with accurate results, improving state-of-the-art methods. Our analysis demonstrates synergistic relations between quality control scores and allelic variability estimators, that improve the detection of misassembled regions, and is able to find strong artifact signals even within the human reference assembly. Furthermore, we demonstrated how our model can be trained to properly rank the confidence of a set of candidate variants obtained from new independent samples.

Availability and implementation: This tool is freely available at http://gitlab.com/carbonell/ces.

Contact: jcarbonell.cipf@gmail.com or joaquin.dopazo@juntadeandalucia.es.

Supplementary information: Supplementary data are available at Bioinformatics online.},
}

@article{558,
  author   = {Lucariello, Mario and Vidal, Enrique and Vidal, Silvia and Saez, Mauricio and Roa, Laura and Huertas, Dori and Pineda, Merc{\`e} and Dalf{\'o}, Esther and Dopazo, Joaquin and Jurado, Paola and Armstrong, Judith and Esteller, Manel},
  title    = {Whole exome sequencing of {Rett} syndrome-like patients reveals the mutational diversity of the clinical phenotype},
  journal  = {Human Genetics},
  volume   = {135},
  year     = {2016},
  month    = dec,
  pages    = {1343--1354},
  issn     = {1432-1203},
  doi      = {10.1007/s00439-016-1721-3},
  keywords = {Adolescent, Adult, Animals, Caenorhabditis elegans, Carrier Proteins, Cell Cycle Proteins, Child, Child, Preschool, DNA Mutational Analysis, Exome, Female, Forkhead Transcription Factors, Genetic Variation, High-Throughput Nucleotide Sequencing, Humans, Methyl-CpG-Binding Protein 2, mutation, Nerve Tissue Proteins, Protein Serine-Threonine Kinases, Receptors, Nicotinic, Rett Syndrome},
  abstract = {Classical Rett syndrome (RTT) is a neurodevelopmental disorder where most of cases carry MECP2 mutations. Atypical RTT variants involve mutations in CDKL5 and FOXG1. However, a subset of RTT patients remains that do not carry any mutation in the described genes. Whole exome sequencing was carried out in a cohort of 21 female probands with clinical features overlapping with those of RTT, but without mutations in the customarily studied genes. Candidates were functionally validated by assessing the appearance of a neurological phenotype in Caenorhabditis elegans upon disruption of the corresponding ortholog gene. We detected pathogenic variants that accounted for the RTT-like phenotype in 14 (66.6~\%) patients. Five patients were carriers of mutations in genes already known to be associated with other syndromic neurodevelopmental disorders. We determined that the other patients harbored mutations in genes that have not previously been linked to RTT or other neurodevelopmental syndromes, such as the ankyrin repeat containing protein ANKRD31 or the neuronal acetylcholine receptor subunit alpha-5 (CHRNA5). Furthermore, worm assays demonstrated that mutations in the studied candidate genes caused locomotion defects. Our findings indicate that mutations in a variety of genes contribute to the development of RTT-like phenotypes.},
}

@article{494,
  author   = {Garc{\'\i}a-Alonso, Luz and Jim{\'e}nez-Almaz{\'a}n, Jorge and Carbonell-Caballero, Jos{\'e} and Vela-Boza, Alicia and Santoyo-L{\'o}pez, Javier and Anti{\~n}olo, Guillermo and Dopazo, Joaquin},
  title    = {The role of the interactome in the maintenance of deleterious variability in human populations},
  journal  = {Molecular Systems Biology},
  volume   = {10},
  year     = {2014},
  month    = sep,
  day      = {26},
  pages    = {752},
  issn     = {1744-4292},
  doi      = {10.15252/msb.20145222},
  keywords = {Alleles, Exome, Gene Library, Genetic Variation, Genetics, Population, Genome, Human, Genomics, Humans, Models, Genetic, mutation, Phenotype, Protein Conformation, Protein Interaction Maps, Sequence Analysis, DNA, Whites},
  abstract = {Recent genomic projects have revealed the existence of an unexpectedly large amount of deleterious variability in the human genome. Several hypotheses have been proposed to explain such an apparently high mutational load. However, the mechanisms by which deleterious mutations in some genes cause a pathological effect but are apparently innocuous in other genes remain largely unknown. This study searched for deleterious variants in the 1,000 genomes populations, as well as in a newly sequenced population of 252 healthy Spanish individuals. In addition, variants causative of monogenic diseases and somatic variants from 41 chronic lymphocytic leukaemia patients were analysed. The deleterious variants found were analysed in the context of the interactome to understand the role of network topology in the maintenance of the observed mutational load. Our results suggest that one of the mechanisms whereby the effect of these deleterious variants on the phenotype is suppressed could be related to the configuration of the protein interaction network. Most of the deleterious variants observed in healthy individuals are concentrated in peripheral regions of the interactome, in combinations that preserve their connectivity, and have a marginal effect on interactome integrity. On the contrary, likely pathogenic cancer somatic deleterious variants tend to occur in internal regions of the interactome, often with associated structural consequences. Finally, variants causative of monogenic diseases seem to occupy an intermediate position. Our observations suggest that the real pathological potential of a variant might be more a systems property rather than an intrinsic property of individual proteins.},
}

@article{513,
  author   = {Sundaram, Arvind Y M and Kiron, Viswanath and Dopazo, Joaquin and Fernandes, Jorge M O},
  title    = {Diversification of the expanded teleost-specific toll-like receptor family in {Atlantic} cod, {Gadus} morhua},
  journal  = {BMC Evolutionary Biology},
  volume   = {12},
  year     = {2012},
  month    = dec,
  day      = {29},
  pages    = {256},
  issn     = {1471-2148},
  doi      = {10.1186/1471-2148-12-256},
  keywords = {Amino Acid Sequence, Animals, Binding Sites, Evolution, Molecular, Fish Diseases, Fish Proteins, Gadus morhua, Gene Expression Profiling, Genetic Variation, Gills, Head Kidney, Host-Pathogen Interactions, Models, Molecular, Molecular Sequence Data, Multigene Family, Phylogeny, Protein Structure, Tertiary, Reverse Transcriptase Polymerase Chain Reaction, Selection, Genetic, Sequence Analysis, DNA, Sequence Homology, Amino Acid, Temperature, Toll-Like Receptors, Vibrio},
  abstract = {BACKGROUND: Toll-like receptors (Tlrs) are major molecular pattern recognition receptors of the innate immune system. Atlantic cod (Gadus morhua) is the first vertebrate known to have lost most of the mammalian Tlr orthologues, particularly all bacterial recognising and other cell surface Tlrs. On the other hand, its genome encodes a unique repertoire of teleost-specific Tlrs. The aim of this study was to investigate if these duplicate Tlrs have been retained through adaptive evolution to compensate for the lack of other cell surface Tlrs in the cod genome.

RESULTS: In this study, one tlr21, 12 tlr22 and two tlr23 genes representing the teleost-specific Tlr family have been cloned and characterised in cod. Phylogenetic analysis grouped all tlr22 genes under a single clade, indicating that the multiple cod paralogues have arisen through lineage-specific duplications. All tlrs examined were transcribed in immune-related tissues as well as in stomach, gut and gonads of adult cod and were differentially expressed during early development. These tlrs were also differentially regulated following immune challenge by immersion with Vibrio anguillarum, indicating their role in the immune response. An increase in water temperature from 4 to 12{\textdegree}C was associated with a 5.5-fold down-regulation of tlr22d transcript levels in spleen. Maximum likelihood analysis with different evolution models revealed that tlr22 genes are under positive selection. A total of 24 codons were found to be positively selected, of which 19 are in the ligand binding region of ectodomain.

CONCLUSION: Positive selection pressure coupled with experimental evidence of differential expression strongly support the hypothesis that teleost-specific tlr paralogues in cod are undergoing neofunctionalisation and can recognise bacterial pathogen-associated molecular patterns to compensate for the lack of other cell surface Tlrs.},
}

@article{523,
  author   = {Medina, Ignacio and De Maria, Alejandro and Bleda, Marta and Salavert, Francisco and Alonso, Roberto and Gonzalez, Cristina Y and Dopazo, Joaquin},
  title    = {{VARIANT}: {Command Line}, {Web} service and {Web} interface for fast and accurate functional characterization of variants found by {Next-Generation Sequencing}},
  journal  = {Nucleic Acids Research},
  volume   = {40},
  year     = {2012},
  month    = jul,
  pages    = {W54--W58},
  issn     = {1362-4962},
  doi      = {10.1093/nar/gks572},
  keywords = {Databases, Nucleic Acid, Genetic Variation, High-Throughput Nucleotide Sequencing, Internet, Molecular Sequence Annotation, mutation, Polymorphism, Single Nucleotide, Software, User-Computer Interface},
  abstract = {The massive use of Next-Generation Sequencing (NGS) technologies is uncovering an unexpected amount of variability. The functional characterization of such variability, particularly in the most common form of variation found, the Single Nucleotide Variants (SNVs), has become a priority that needs to be addressed in a systematic way. VARIANT (VARIant ANalyis Tool) reports information on the variants found that include consequence type and annotations taken from different databases and repositories (SNPs and variants from dbSNP and 1000 genomes, and disease-related variants from the Genome-Wide Association Study (GWAS) catalog, Online Mendelian Inheritance in Man (OMIM), Catalog of Somatic Mutations in Cancer (COSMIC) mutations, etc). VARIANT also produces a rich variety of annotations that include information on the regulatory (transcription factor or miRNA-binding sites, etc.) or structural roles, or on the selective pressures on the sites affected by the variation. This information allows extending the conventional reports beyond the coding regions and expands the knowledge on the contribution of non-coding or synonymous variants to the phenotype studied. Contrarily to other tools, VARIANT uses a remote database and operates through efficient RESTful Web Services that optimize search and transaction operations. In this way, local problems of installation, update or disk size limitations are overcome without the need of sacrifice speed (thousands of variants are processed per minute). VARIANT is available at: http://variant.bioinfo.cipf.es.},
}

% The source date for the next record carried only the year, so no month is recorded.
@article{536,
  author   = {Gonz{\'a}lez-del Pozo, Mar{\'\i}a and Borrego, Salud and Barrag{\'a}n, Isabel and Pieras, Juan I and Santoyo, Javier and Matamala, Nerea and Naranjo, Bel{\'e}n and Dopazo, Joaquin and Anti{\~n}olo, Guillermo},
  title    = {Mutation screening of multiple genes in {Spanish} patients with autosomal recessive retinitis pigmentosa by targeted resequencing},
  journal  = {PLoS ONE},
  volume   = {6},
  year     = {2011},
  pages    = {e27894},
  issn     = {1932-6203},
  doi      = {10.1371/journal.pone.0027894},
  keywords = {Alleles, DNA Mutational Analysis, Exons, Genetic Variation, Genome, Hispanic or Latino, Humans, Introns, Language, mutation, Mutation, Missense, Oligonucleotide Array Sequence Analysis, Polymerase Chain Reaction, Reproducibility of Results, Retinitis pigmentosa, United States},
  abstract = {Retinitis Pigmentosa (RP) is a heterogeneous group of inherited retinal dystrophies characterised ultimately by the loss of photoreceptor cells. RP is the leading cause of visual loss in individuals younger than 60 years, with a prevalence of about 1 in 4000. The molecular genetic diagnosis of autosomal recessive RP (arRP) is challenging due to the large genetic and clinical heterogeneity. Traditional methods for sequencing arRP genes are often laborious and not easily available and a screening technique that enables the rapid detection of the genetic cause would be very helpful in the clinical practice. The goal of this study was to develop and apply microarray-based resequencing technology capable of detecting both known and novel mutations on a single high-throughput platform. Hence, the coding regions and exon/intron boundaries of 16 arRP genes were resequenced using microarrays in 102 Spanish patients with clinical diagnosis of arRP. All the detected variations were confirmed by direct sequencing and potential pathogenicity was assessed by functional predictions and frequency in controls. For validation purposes 4 positive controls for variants consisting of previously identified changes were hybridized on the array. As a result of the screening, we detected 44 variants, of which 15 are very likely pathogenic detected in 14 arRP families (14\%). Finally, the design of this array can easily be transformed in an equivalent diagnostic system based on targeted enrichment followed by next generation sequencing.},
}

@article{575,
  author   = {Barrag{\'a}n, Isabel and Borrego, Salud and Pieras, Juan Ignacio and Gonz{\'a}lez-del Pozo, Mar{\'\i}a and Santoyo, Javier and Ayuso, Carmen and Baiget, Montserrat and Mill{\'a}n, Jos{\'e} M and Mena, Marcela and Abd El-Aziz, Mai M and Audo, Isabelle and Zeitz, Christina and Littink, Karin W and Dopazo, Joaquin and Bhattacharya, Shomi S and Anti{\~n}olo, Guillermo},
  title    = {Mutation spectrum of {EYS} in {Spanish} patients with autosomal recessive retinitis pigmentosa},
  journal  = {Human Mutation},
  volume   = {31},
  year     = {2010},
  month    = nov,
  pages    = {E1772--E1800},
  issn     = {1098-1004},
  doi      = {10.1002/humu.21334},
  keywords = {Amino Acid Sequence, Animals, Case-Control Studies, DNA Mutational Analysis, Drosophila Proteins, Evolution, Molecular, Eye Proteins, Female, Genes, Recessive, Genetic Variation, Humans, Male, Molecular Sequence Data, mutation, Pedigree, Polymorphism, Single Nucleotide, Protein Structure, Tertiary, Retinitis pigmentosa, Spain, Structural Homology, Protein},
  abstract = {Retinitis pigmentosa (RP) is a heterogeneous group of inherited retinal dystrophies characterised ultimately by the loss of photoreceptor cells. We have recently identified a new gene (EYS) encoding an ortholog of Drosophila space maker (spam) as a commonly mutated gene in autosomal recessive RP. In the present study, we report the identification of 73 sequence variations in EYS, of which 28 are novel. Of these, 42.9\% (12/28) are very likely pathogenic, 17.9\% (5/28) are possibly pathogenic, whereas 39.3\% (11/28) are SNPs. In addition, we have detected 3 pathogenic changes previously reported in other populations. We are also presenting the characterisation of EYS homologues in different species, and a detailed analysis of the EYS domains, with the identification of an interesting novel feature: a putative coiled-coil domain. Majority of the mutations in the arRP patients have been found within the domain structures of EYS. The minimum observed prevalence of distinct EYS mutations in our group of patients is of 15.9\% (15/94), confirming a major involvement of EYS in the pathogenesis of arRP in the Spanish population. Along with the detection of three recurrent mutations in Caucasian population, our hypothesis of EYS being the first prevalent gene in arRP has been reinforced in the present study.},
}

@article{583,
  author   = {Medina, Ignacio and Montaner, David and Bonifaci, N{\'u}ria and Pujana, Miguel Angel and Carbonell, Jos{\'e} and T{\'a}rraga, Joaqu{\'\i}n and Al-Shahrour, F{\'a}tima and Dopazo, Joaquin},
  title    = {Gene set-based analysis of polymorphisms: finding pathways or biological processes associated to traits in genome-wide association studies},
  journal  = {Nucleic Acids Research},
  volume   = {37},
  year     = {2009},
  month    = jul,
  pages    = {W340--W344},
  issn     = {1362-4962},
  doi      = {10.1093/nar/gkp481},
  keywords = {Biological Phenomena, Breast Neoplasms, Female, Genes, Genetic Variation, Genome-Wide Association Study, Humans, Polymorphism, Single Nucleotide, Software, User-Computer Interface},
  abstract = {Genome-wide association studies have become a popular strategy to find associations of genes to traits of interest. Despite the high-resolution available today to carry out genotyping studies, the success of its application in real studies has been limited by the testing strategy used. As an alternative to brute force solutions involving the use of very large cohorts, we propose the use of the Gene Set Analysis (GSA), a different analysis strategy based on testing the association of modules of functionally related genes. We show here how the Gene Set-based Analysis of Polymorphisms (GeSBAP), which is a simple implementation of the GSA strategy for the analysis of genome-wide association studies, provides a significant increase in the power testing for this type of studies. GeSBAP is freely available at http://bioinfo.cipf.es/gesbap/.},
}

@article{600,
  author   = {Capriotti, Emidio and Arbiza, Leonardo and Casadio, Rita and Dopazo, Joaquin and Dopazo, Hern{\'a}n and Marti-Renom, Marc A},
  title    = {Use of estimated evolutionary strength at the codon level improves the prediction of disease-related protein mutations in humans},
  journal  = {Human Mutation},
  volume   = {29},
  year     = {2008},
  month    = jan,
  pages    = {198--204},
  issn     = {1098-1004},
  doi      = {10.1002/humu.20628},
  keywords = {Algorithms, Codon, Computational Biology, Databases, Protein, DNA Mutational Analysis, Evolution, Molecular, Genetic Predisposition to Disease, Genetic Variation, Genome, Human, Humans, Iduronic Acid, Point Mutation, Polymorphism, Single Nucleotide, Proteins, Tumor Suppressor Protein p53},
  abstract = {Predicting the functional impact of protein variation is one of the most challenging problems in bioinformatics. A rapidly growing number of genome-scale studies provide large amounts of experimental data, allowing the application of rigorous statistical approaches for predicting whether a given single point mutation has an impact on human health. Up until now, existing methods have limited their source data to either protein or gene information. Novel in this work, we take advantage of both and focus on protein evolutionary information by using estimated selective pressures at the codon level. Here we introduce a new method (SeqProfCod) to predict the likelihood that a given protein variant is associated with human disease or not. Our method relies on a support vector machine (SVM) classifier trained using three sources of information: protein sequence, multiple protein sequence alignments, and the estimation of selective pressure at the codon level. SeqProfCod has been benchmarked with a large dataset of 8,987 single point mutations from 1,434 human proteins from SWISS-PROT. It achieves 82\% overall accuracy and a correlation coefficient of 0.59, indicating that the estimation of the selective pressure helps in predicting the functional impact of single-point mutations. Moreover, this study demonstrates the synergic effect of combining two sources of information for predicting the functional effects of protein variants: protein sequence/profile-based information and the evolutionary estimation of the selective pressures at the codon level. The results of large-scale application of SeqProfCod over all annotated point mutations in SWISS-PROT (available for download at http://sgu.bioinfo.cipf.es/services/Omidios/; last accessed: 24 August 2007), could be used to support clinical studies.},
}