{
    "@context": [
        "https://w3id.org/codemeta/3.0",
        "https://w3id.org/software-iodata",
        "https://w3id.org/nwo-research-fields",
        "https://raw.githubusercontent.com/jantman/repostatus.org/master/badges/latest/ontology.jsonld",
        "https://w3id.org/research-technology-readiness-levels",
        "https://schema.org",
        "https://w3id.org/software-types"
    ],
    "@id": "https://tools.clariah.nl/colibri-core/2.5.10",
    "@type": "SoftwareSourceCode",
    "author": [
        {
            "@id": "https://orcid.org/0000-0002-1046-0006",
            "@type": "Person",
            "affiliation": [
                {
                    "@id": "https://www.ru.nl/clst",
                    "@type": "Organization",
                    "name": "Centre for Language and Speech Technology",
                    "parentOrganization": {
                        "@id": "https://www.ru.nl/cls",
                        "@type": "Organization",
                        "name": "Centre for Language Studies",
                        "parentOrganization": {
                            "@id": "https://www.ru.nl",
                            "@type": "Organization",
                            "location": {
                                "@type": "Place",
                                "name": "Nijmegen"
                            },
                            "name": "Radboud University",
                            "url": "https://www.ru.nl"
                        },
                        "url": "https://www.ru.nl/cls"
                    },
                    "url": "https://www.ru.nl/clst"
                },
                {
                    "@id": "https://huc.knaw.nl",
                    "@type": "Organization",
                    "name": [
                        "Humanities Cluster",
                        "KNAW Humanities Cluster"
                    ],
                    "parentOrganization": {
                        "@id": "https://knaw.nl",
                        "@type": "Organization",
                        "location": {
                            "@type": "Place",
                            "name": "Amsterdam"
                        },
                        "name": "KNAW",
                        "url": "https://knaw.nl"
                    },
                    "url": "https://huc.knaw.nl"
                },
                {
                    "@id": "https://www.ru.nl/cls",
                    "@type": "Organization",
                    "name": "Centre for Language Studies",
                    "parentOrganization": {
                        "@id": "https://www.ru.nl",
                        "@type": "Organization",
                        "location": {
                            "@type": "Place",
                            "name": "Nijmegen"
                        },
                        "name": "Radboud University",
                        "url": "https://www.ru.nl"
                    },
                    "url": "https://www.ru.nl/cls"
                },
                {
                    "@id": "https://knaw.huc.nl"
                }
            ],
            "email": "proycon@anaproy.nl",
            "familyName": "van Gompel",
            "givenName": "Maarten",
            "position": 4,
            "url": "https://proycon.anaproy.nl"
        }
    ],
    "codeRepository": "https://github.com/proycon/colibri-core",
    "continuousIntegration": {
        "@id": "https://travis-ci.org/proycon/colibri-core"
    },
    "contributor": [
        {
            "@id": "https://orcid.org/0000-0002-1046-0006",
            "@type": "Person",
            "affiliation": [
                {
                    "@id": "https://www.ru.nl/clst",
                    "@type": "Organization",
                    "name": "Centre for Language and Speech Technology",
                    "parentOrganization": {
                        "@id": "https://www.ru.nl/cls",
                        "@type": "Organization",
                        "name": "Centre for Language Studies",
                        "parentOrganization": {
                            "@id": "https://www.ru.nl",
                            "@type": "Organization",
                            "location": {
                                "@type": "Place",
                                "name": "Nijmegen"
                            },
                            "name": "Radboud University",
                            "url": "https://www.ru.nl"
                        },
                        "url": "https://www.ru.nl/cls"
                    },
                    "url": "https://www.ru.nl/clst"
                },
                {
                    "@id": "https://huc.knaw.nl",
                    "@type": "Organization",
                    "name": [
                        "Humanities Cluster",
                        "KNAW Humanities Cluster"
                    ],
                    "parentOrganization": {
                        "@id": "https://knaw.nl",
                        "@type": "Organization",
                        "location": {
                            "@type": "Place",
                            "name": "Amsterdam"
                        },
                        "name": "KNAW",
                        "url": "https://knaw.nl"
                    },
                    "url": "https://huc.knaw.nl"
                },
                {
                    "@id": "https://www.ru.nl/cls",
                    "@type": "Organization",
                    "name": "Centre for Language Studies",
                    "parentOrganization": {
                        "@id": "https://www.ru.nl",
                        "@type": "Organization",
                        "location": {
                            "@type": "Place",
                            "name": "Nijmegen"
                        },
                        "name": "Radboud University",
                        "url": "https://www.ru.nl"
                    },
                    "url": "https://www.ru.nl/cls"
                },
                {
                    "@id": "https://knaw.huc.nl"
                }
            ],
            "email": "proycon@anaproy.nl",
            "familyName": "van Gompel",
            "givenName": "Maarten",
            "position": 4,
            "url": "https://proycon.anaproy.nl"
        }
    ],
    "dateCreated": "2013-09-15",
    "description": "Colibri Core is an NLP tool as well as a C++ and Python library for working with basic linguistic constructions such as n-grams and skipgrams (i.e. patterns with one or more gaps, either of fixed or dynamic size) in a quick and memory-efficient way.",
    "developmentStatus": {
        "@id": "https://www.repostatus.org/#active",
        "@type": "skos:Concept",
        "og:image": "https://www.repostatus.org/badges/latest/active.svg",
        "skos:definition": "The project has reached a stable, usable state and is being actively developed.",
        "skos:inScheme": "https://www.repostatus.org",
        "skos:prefLabel": "Active"
    },
    "https://github.com/proycon/codemetapy/errors": 0,
    "https://github.com/proycon/codemetapy/log": "(log file starts at Thu Jun 18 03:04:42 UTC 2026)\n\n[harvester info] --> Processing colibri-core (https://github.com/proycon/colibri-core) [Thu Jun 18 03:04:42 UTC 2026]\n\n[harvester info] Git updating cached clone of https://github.com/proycon/colibri-core...\n\n[harvester info] Found release v2.5.10\n\n[harvester info] Using 'v2.5.10'\n\n[harvester info] Git reference: v2.5.10\n\n[harvester info] Scanning directory /tmp/codemeta-harvester.cache/colibri-core for harvestable resources...\n\n[harvester info] found codemeta.json for colibri-core (md5sum ca71007ea33ff9c770a30aaa93781c11); **NOTE: this is considered authoritative and most other detection methods will be skipped now!**\n\n[harvester info] Inferring repostatus information from git activity (used only as a fallback if not explicitly provided)...\n\n[harvester info] Inferred repostatus https://www.repostatus.org/#active\n\n[harvester info] Looking for repostatus information in README.md in master branch...\n\n[harvester info] Found repostatus (master branch) https://www.repostatus.org/#active\n\n[harvester info] Reconciliating: codemetapy  --baseuri https://tools.clariah.nl --baseuri https://tools.clariah.nl --includecontext --addcontext https://w3id.org/nwo-research-fields --addcontext https://w3id.org/research-technology-readiness-levels --addcontextgraph https://vocabs.dariah.eu/rest/v1/tadirah/data?format=text/turtle --trl --identifier \"colibri-core\" --codeRepository \"https://github.com/proycon/colibri-core\" --validate /etc/software.ttl --released --enrich --textv \"Please consult the CLARIAH Software Metadata Requirements at https://github.com/CLARIAH/clariah-plus/blob/main/requirements/software-metadata-requirements.md for an in-depth explanation of any found problems\" -O /tmp/out/colibri-core.codemeta.json /tmp/codemeta-harvester.cache//tmp/99-repostatus.colibri-core.codemeta.json 
/tmp/codemeta-harvester.cache//tmp/10-jsonld.colibri-core.codemeta.json /tmp/codemeta-harvester.cache//tmp/05-repostatus.colibri-core.codemeta.json \n\n-- begin log --\n\nPassed 3 files/sources but specified 0 input types! Automatically guessing types...\n\nDetected input types: [('/tmp/codemeta-harvester.cache//tmp/99-repostatus.colibri-core.codemeta.json', 'json'), ('/tmp/codemeta-harvester.cache//tmp/10-jsonld.colibri-core.codemeta.json', 'json'), ('/tmp/codemeta-harvester.cache//tmp/05-repostatus.colibri-core.codemeta.json', 'json')]\n\nAdding to contextgraph: /tmp/turtle\n\nInitial URI automatically generated, may be overriden later: https://tools.clariah.nl/colibri-core\n\nProcessing source #1 of 3\n\nParsing json-ld file from /tmp/codemeta-harvester.cache//tmp/99-repostatus.colibri-core.codemeta.json\n\n    NOTE: Not a valid JSON-LD document, @context missing! Attempting to inject automatically...\n\n    Injected (possibly temporary) URI https://tools.clariah.nl/colibri-core\n\n[CODEMETA COMPOSITION (https://tools.clariah.nl/colibri-core)] processed 1 new triples, total is now 2\n\nProcessing source #2 of 3\n\nParsing json-ld file from /tmp/codemeta-harvester.cache//tmp/10-jsonld.colibri-core.codemeta.json\n\n    Injected (possibly temporary) URI https://tools.clariah.nl/colibri-core\n\n[CODEMETA 2 TO 3] Updating contIntegration -> continuousIntegration\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> 
isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA 2 TO 3] Updating targetProduct -> isSourceCodeOf\n\n[CODEMETA CORRECTION (colibricore)] automatically converting spdx license URI from https:// to http:///\n\n[CODEMETA COMPOSITION (colibricore)] processed 129 new triples, total is now 129\n\nProcessing source #3 of 3\n\nParsing json-ld file from /tmp/codemeta-harvester.cache//tmp/05-repostatus.colibri-core.codemeta.json\n\n    NOTE: Not a valid JSON-LD document, @context missing! Attempting to inject automatically...\n\n    Injected (possibly temporary) URI https://tools.clariah.nl/colibri-core\n\n[CODEMETA COMPOSITION (colibricore)] processed 1 new triples, total is now 129\n\nRemapping URI to (possibly) new identifier and version component: https://tools.clariah.nl/colibri-core -> https://tools.clariah.nl/colibri-core/2.5.10\n\n[CODEMETA VALIDATION (colibri-core)] done\n\n[CODEMETA ENRICHMENT (colibri-core)] adding author https://orcid.org/0000-0002-1046-0006 as contributor\n\n[CODEMETA ENRICHMENT (colibri-core)] considering first author as maintainer\n\n[CODEMETA ENRICHMENT (colibri-core)] adding affiliation(s) of first author as producer\n\nVALIDATION https://tools.clariah.nl/colibri-core/2.5.10 #1: Warning: Documentation *SHOULD* be expressed (The metadata does express this currently, but something is wrong in the way it is expressed. 
Is the type/class valid?)\n\nVALIDATION https://tools.clariah.nl/colibri-core/2.5.10 #2: Info: The funder *SHOULD* be acknowledged (This is missing in the metadata)\n\nVALIDATION https://tools.clariah.nl/colibri-core/2.5.10 #3: Info: The technology readiness level *SHOULD* be expressed (This is missing in the metadata)\n\nVALIDATION https://tools.clariah.nl/colibri-core/2.5.10 #4: Info: A research domain *SHOULD* be expressed as a category using the NWO Research Fields vocabulary, if applicable (This is missing in the metadata)\n\nVALIDATION https://tools.clariah.nl/colibri-core/2.5.10 #5: Info: A research activity *SHOULD* be expressed as a category using the TaDiRaH vocabulary (This is missing in the metadata)\n\n-- end log --\n\n[harvester info] Output written to /tmp/out/colibri-core.codemeta.json\n\n[harvester info] <-- Finished processing colibri-core (https://github.com/proycon/colibri-core) [Thu Jun 18 03:04:44 UTC 2026]\n",
    "identifier": "colibri-core",
    "isSourceCodeOf": [
        {
            "@type": "CommandLineApplication",
            "description": "Decodes a binary encoded corpus and a class file to a plain text corpus",
            "executableName": "colibri-classdecode",
            "name": "colibri-classdecode"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Encodes a plain text corpus to a binary encoded corpus and a class file",
            "executableName": "colibri-classencode",
            "name": "colibri-classencode"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Computes co-occurrence statistics (absolute co-occurrence or pointwise mutual information) between patterns in a corpus",
            "executableName": "colibri-cooc",
            "name": "colibri-cooc"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Computes the coverage of a training/background corpus on a particular test/foreground corpus, i.e. how many of the patterns in the test corpus were found during training, how many tokens are covered, and how is this all distributed? This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-coverage",
            "name": "colibri-coverage"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Find patterns in corpus data based on a presupplied list of patterns (one per line). This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-findpatterns",
            "name": "colibri-findpatterns"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Extract n-grams (and optionally skipgrams) with their counts from one or more plain-text corpus files. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-freqlist",
            "name": "colibri-freqlist"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Computes a histogram for ngram occurrences (and optionally skipgrams) in the corpus. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-histogram",
            "name": "colibri-histogram"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Compares the frequency of patterns between two or more corpus files (plain text) by computing log likelihood, following the methodology of Rayson and Garside (2000), Comparing corpora using frequency profiling. In proceedings of the workshop on Comparing Corpora, held in conjunction with the 38th annual meeting of the Association for Computational Linguistics (ACL 2000). 1-8 October 2000, Hong Kong, pp. 1 - 6: http://www.comp.lancs.ac.uk/~paul/publications/rg_acl2000.pdf. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-loglikelihood",
            "name": "colibri-loglikelihood"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Extract n-grams of a particular size by moving a sliding window over the corpus. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-ngrams",
            "name": "colibri-ngrams"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Computes a summary report on the count of ngrams (and optionally skipgrams) in the corpus. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-ngramstats",
            "name": "colibri-ngramstats"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Extract, model and compare recurring patterns (n-grams, skipgrams, flexgrams) and their frequencies in text corpus data. This is the main tool of Colibri Core.",
            "executableName": "colibri-patternmodeller",
            "name": "colibri-patternmodeller"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Interactive command line tool to query n-grams with their counts from one or more plain-text corpus files. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-queryngrams",
            "name": "colibri-queryngrams"
        },
        {
            "@type": "CommandLineApplication",
            "description": "Computes and prints the reverse index of the corpus: for each token position in the corpus, all patterns that start at that position are shown. This is a high-level convenience script over underlying tools.",
            "executableName": "colibri-reverseindex",
            "name": "colibri-reverseindex"
        }
    ],
    "issueTracker": "https://github.com/proycon/colibri-core/issues",
    "keywords": [
        "language modelling",
        "natural language processing",
        "ngrams",
        "nlp",
        "pattern recognition",
        "skipgrams"
    ],
    "license": {
        "@id": "http://spdx.org/licenses/GPL-3.0-only",
        "name": "GNU General Public License v3"
    },
    "maintainer": {
        "@id": "https://orcid.org/0000-0002-1046-0006",
        "@type": "Person",
        "affiliation": [
            {
                "@id": "https://www.ru.nl/clst",
                "@type": "Organization",
                "name": "Centre for Language and Speech Technology",
                "parentOrganization": {
                    "@id": "https://www.ru.nl/cls",
                    "@type": "Organization",
                    "name": "Centre for Language Studies",
                    "parentOrganization": {
                        "@id": "https://www.ru.nl",
                        "@type": "Organization",
                        "location": {
                            "@type": "Place",
                            "name": "Nijmegen"
                        },
                        "name": "Radboud University",
                        "url": "https://www.ru.nl"
                    },
                    "url": "https://www.ru.nl/cls"
                },
                "url": "https://www.ru.nl/clst"
            },
            {
                "@id": "https://huc.knaw.nl",
                "@type": "Organization",
                "name": [
                    "Humanities Cluster",
                    "KNAW Humanities Cluster"
                ],
                "parentOrganization": {
                    "@id": "https://knaw.nl",
                    "@type": "Organization",
                    "location": {
                        "@type": "Place",
                        "name": "Amsterdam"
                    },
                    "name": "KNAW",
                    "url": "https://knaw.nl"
                },
                "url": "https://huc.knaw.nl"
            },
            {
                "@id": "https://www.ru.nl/cls",
                "@type": "Organization",
                "name": "Centre for Language Studies",
                "parentOrganization": {
                    "@id": "https://www.ru.nl",
                    "@type": "Organization",
                    "location": {
                        "@type": "Place",
                        "name": "Nijmegen"
                    },
                    "name": "Radboud University",
                    "url": "https://www.ru.nl"
                },
                "url": "https://www.ru.nl/cls"
            },
            {
                "@id": "https://knaw.huc.nl"
            }
        ],
        "email": "proycon@anaproy.nl",
        "familyName": "van Gompel",
        "givenName": "Maarten",
        "position": 4,
        "url": "https://proycon.anaproy.nl"
    },
    "name": "Colibri Core",
    "operatingSystem": [
        "BSD",
        "Linux",
        "macOS"
    ],
    "producer": {
        "@id": "https://www.ru.nl/cls",
        "@type": "Organization",
        "name": "Centre for Language Studies",
        "parentOrganization": {
            "@id": "https://www.ru.nl",
            "@type": "Organization",
            "location": {
                "@type": "Place",
                "name": "Nijmegen"
            },
            "name": "Radboud University",
            "url": "https://www.ru.nl"
        },
        "url": "https://www.ru.nl/cls"
    },
    "programmingLanguage": [
        {
            "@type": "ComputerLanguage",
            "identifier": "c++",
            "name": "C++"
        },
        {
            "@type": "ComputerLanguage",
            "identifier": "cython",
            "name": "Cython"
        }
    ],
    "readme": "https://github.com/proycon/colibri-core/blob/master/README.md",
    "referencePublication": {
        "@id": "https://dx.doi.org/10.5334/jors.105",
        "@type": "TechArticle",
        "author": [
            "Maarten van Gompel",
            "Antal van den Bosch"
        ],
        "isPartOf": {
            "@type": "PublicationIssue",
            "datePublished": "2016",
            "issue": "4",
            "name": "Journal of Open Research Software"
        },
        "name": "Efficient n-gram, Skipgram and Flexgram Modelling with Colibri Core",
        "url": "https://dx.doi.org/10.5334/jors.105"
    },
    "releaseNotes": "https://github.com/proycon/colibri-core/releases",
    "review": {
        "@id": "https://tools.clariah.nl/validation/Nf3b3f01f60ee833ec385322d47984bcb",
        "@type": "Review",
        "author": [
            "codemetapy validator using software.ttl"
        ],
        "datePublished": "2026-06-18 03:04:43",
        "name": "Automatic software metadata validation report for Colibri Core 2.5.10",
        "reviewBody": "Please consult the CLARIAH Software Metadata Requirements at https://github.com/CLARIAH/clariah-plus/blob/main/requirements/software-metadata-requirements.md for an in-depth explanation of any found problems\n\nValidation of Colibri Core 2.5.10 was successful (score=3/5), but there are some warnings which should be addressed:\n\n1. Warning: Documentation *SHOULD* be expressed (The metadata does express this currently, but something is wrong in the way it is expressed. Is the type/class valid?)\n2. Info: The funder *SHOULD* be acknowledged (This is missing in the metadata)\n3. Info: The technology readiness level *SHOULD* be expressed (This is missing in the metadata)\n4. Info: A research domain *SHOULD* be expressed as a category using the NWO Research Fields vocabulary, if applicable (This is missing in the metadata)\n5. Info: A research activity *SHOULD* be expressed as a category using the TaDiRaH vocabulary (This is missing in the metadata)",
        "reviewRating": 3
    },
    "softwareHelp": "https://proycon.github.io/colibri-core/doc/",
    "url": "https://proycon.github.io/colibri-core",
    "version": "2.5.10"
}