From 1fef4ecc6712957c475a39048bf4df49f11a4a47 Mon Sep 17 00:00:00 2001 From: Jonathan Smith <20701-jonathan1055@users.noreply.drupalcode.org> Date: Fri, 20 Dec 2024 10:03:34 +0000 Subject: [PATCH] Issue #3439240 by jonathan1055, fjgarlin, grimreaper: Cspell: sanitize suggested words for dictionary --- assets/.cspell.json | 2 +- assets/internal/.cspell.json | 3 +- docs/jobs/cspell.md | 6 ++- includes/include.drupalci.main.yml | 15 +++++-- includes/include.drupalci.variables.yml | 4 ++ scripts/prepare-cspell.php | 53 ++++++++++++++++++------- 6 files changed, 61 insertions(+), 22 deletions(-) diff --git a/assets/.cspell.json b/assets/.cspell.json index d01861d3..af45fc6c 100644 --- a/assets/.cspell.json +++ b/assets/.cspell.json @@ -1,5 +1,5 @@ { - "description": "This default cspell configuration for contrib projects is based on core/.cspell.json. See https://project.pages.drupalcode.org/gitlab_templates/jobs/cspell/ for more details.", + "description": "This default cspell configuration for contrib projects is based on core/.cspell.json. Some of the arrays are expanded in scripts/prepare-cspell.php. See https://project.pages.drupalcode.org/gitlab_templates/jobs/cspell/ for more details.", "language": "en-US", "allowCompoundWords": false, "globRoot": ".", diff --git a/assets/internal/.cspell.json b/assets/internal/.cspell.json index b818426a..44f95250 100644 --- a/assets/internal/.cspell.json +++ b/assets/internal/.cspell.json @@ -15,7 +15,8 @@ ".git", "vendor", "node_modules", - "assets" + "assets/.cspell.json", + ".cspell.json" ], "dictionaryDefinitions": [ { diff --git a/docs/jobs/cspell.md b/docs/jobs/cspell.md index e5669d9a..1c230e31 100644 --- a/docs/jobs/cspell.md +++ b/docs/jobs/cspell.md @@ -19,7 +19,11 @@ variables: The words should be comma-separated but each word _does not_ need to be quoted individually. The list is not case-sensitive. 
### Custom project dictionary -If there are many words in your project that are invented or that are not included in the default dictionaries you can add a `.cspell-project-words.txt` file to your project. Each word should be on a separate line, and blank lines and comments starting with `#` are ignored. [CSpell's Words List Syntax](https://cspell.org/docs/dictionaries-custom/#words-list-syntax) has more details. +If there are many words in your project that are invented or that are not included in the default dictionaries you can add a custom dictionary text file to your project. Each word should be on a separate line, and blank lines and comments starting with `#` are ignored. [CSpell's Words List Syntax](https://cspell.org/docs/dictionaries-custom/#words-list-syntax) has more details. The default name of the project dictionary file is `.cspell-project-words.txt` but you can use a different name by defining a `_CSPELL_DICTIONARY` variable: +``` +variables: + _CSPELL_DICTIONARY: 'my-project-dictionary.txt' +``` ### Ignore words specific to one file If a file contains some reported words that are only used in that file, instead of adding them to the project dictionary they can be listed at the top of the file. This is done by adding a special style of comment that CSpell will interpret.
The format for a list of words is `cspell:ignore mycustomthing madeupword` diff --git a/includes/include.drupalci.main.yml b/includes/include.drupalci.main.yml index 92699530..3832c859 100644 --- a/includes/include.drupalci.main.yml +++ b/includes/include.drupalci.main.yml @@ -880,6 +880,7 @@ cspell: name: artifacts-$CI_PIPELINE_ID-$CI_JOB_NAME_SLUG paths: - _cspell_unrecognized_words.txt + - _cspell_updated_project_words.txt - _cspell_json.txt script: - echo "Executing curl -OL https://git.drupalcode.org/$_CURL_TEMPLATES_REPO/-/raw/$_CURL_TEMPLATES_REF/scripts/prepare-cspell.php" @@ -901,15 +902,21 @@ cspell: - echo "Executing $CI_PROJECT_DIR/$_WEB_ROOT/core/node_modules/.bin/cspell -c .cspell.json --show-suggestions --show-context --no-progress $_CSPELL_EXTRA $CSPELL_SEARCH" - $CI_PROJECT_DIR/$_WEB_ROOT/core/node_modules/.bin/cspell -c .cspell.json --show-suggestions --show-context --no-progress $_CSPELL_EXTRA $CSPELL_SEARCH || EXIT_CODE=$? - WORDS_FILE=_cspell_unrecognized_words.txt - - touch $WORDS_FILE + - UPDATED_PROJECT_DICTIONARY=_cspell_updated_project_words.txt + - touch $_CSPELL_DICTIONARY $WORDS_FILE $UPDATED_PROJECT_DICTIONARY - | if [ "$EXIT_CODE" != "" ]; then # There are some unrecognized words so create an artifact file containing the unique list. $CI_PROJECT_DIR/$_WEB_ROOT/core/node_modules/.bin/cspell -c .cspell.json --words-only --unique --no-progress $_CSPELL_EXTRA $CSPELL_SEARCH | sort --ignore-case >> $WORDS_FILE || true - echo "The number of unrecognised/misspelled words is $(wc -l < $WORDS_FILE)" - echo "An artifact file has been created containing a list of these unrecognized words, for you to browse or download." + # Convert all words to lower-case and de-duplicate the list. 
+ tr '[:upper:]' '[:lower:]' < $WORDS_FILE | LC_ALL=C sort -u -o $WORDS_FILE + echo "The number of distinct unrecognised/misspelled words is $(wc -l < $WORDS_FILE)" + echo "------------" && cat $WORDS_FILE && echo "------------" + echo "An artifact $WORDS_FILE has been created containing these unrecognized words, for you to browse or download." + # Read the project dictionary and the new unrecognized words and create a new complete project dictionary artifact file. + cat $_CSPELL_DICTIONARY $WORDS_FILE | tr '[:upper:]' '[:lower:]' | LC_ALL=C sort -u -o $UPDATED_PROJECT_DICTIONARY + echo "An artifact $UPDATED_PROJECT_DICTIONARY has been created containing the complete list of words in your project dictionary (if you have one) plus any newly reported words." echo "For hints on getting this CSpell job to pass see https://project.pages.drupalcode.org/gitlab_templates/jobs/cspell/" - echo "=== This is $WORDS_FILE ===" && cat $WORDS_FILE fi - cp .cspell.json _cspell_json.txt - echo "Exiting with EXIT_CODE=$EXIT_CODE" diff --git a/includes/include.drupalci.variables.yml b/includes/include.drupalci.variables.yml index 01992952..473baeb2 100644 --- a/includes/include.drupalci.variables.yml +++ b/includes/include.drupalci.variables.yml @@ -116,6 +116,10 @@ variables: value: '' description: 'A comma-separated list of words to add to the CSpell dictionary. For example `mycustomthing, madeupword`. Quotes are not required.' + _CSPELL_DICTIONARY: + value: '.cspell-project-words.txt' + description: 'The name of the project dictionary of custom words. The default is `.cspell-project-words.txt` but a project can have a custom name if required.' + _CSPELL_FLAGWORDS: value: '' description: 'A comma-separated list of real words in other directories that should not be used. For example Drupal Core has `please` as a Flag Word. Quotes are not required.' 
diff --git a/scripts/prepare-cspell.php b/scripts/prepare-cspell.php index 90d2a53b..3b93d0be 100644 --- a/scripts/prepare-cspell.php +++ b/scripts/prepare-cspell.php @@ -5,11 +5,21 @@ * @file * Prepares a .cspell.json file customized for the gitlab templates environment. * - * Param 1 = test_suffix (optional) - Additional suffix to append to the input - * filename, before writing out. This is used when running the script locally - * during development, to avoid overwriting the input .cspell.json file. + * Arguments: + * + * -s --suffix Optional suffix to append to the input filename before + * writing out. Useful when running locally during development + * to avoid overwriting the input .cspell.json file. + * + * -v --verbose Show verbose debug output. */ +// Get the arguments. A trailing colon marks an option that takes a value. +$options = getopt('s:v', ['suffix:', 'verbose']); +$quiet = !array_key_exists('v', $options) && !array_key_exists('verbose', $options); +$suffix = $options['s'] ?? $options['suffix'] ?? ''; +$quiet ?: print '$suffix=' . $suffix . PHP_EOL; + // Get the contents of .cspell.json into an array. This file will be either the // projects own .cspell.json or the default copied from /assets. $cspell_filename = '.cspell.json'; @@ -17,14 +27,10 @@ $cspell_json = json_decode(file_get_contents($cspell_filename), TRUE); if (empty($cspell_json)) { throw new RuntimeException("Unable to read $cspell_filename"); } - -// Allow for easy testing by avoiding overwriting the input file. -$test_suffix = $argv[1] ?? ''; -$cspell_filename .= $test_suffix; - -$webRoot = getenv('_WEB_ROOT') ?: 'web'; +$quiet ?: print 'At start cspell_json=' . print_r($cspell_json, TRUE) . PHP_EOL; // Some directories in the project root are not part of the project. +$webRoot = getenv('_WEB_ROOT') ?: 'web'; $non_project_directories = ["$webRoot", 'vendor', 'node_modules', '.git']; // Specify the files that are always ignored. @@ -60,10 +66,12 @@ $filenames_to_find = [ // // Get the words from $_CSPELL_WORDS.
if ($cspell_words = getenv('_CSPELL_WORDS')) { + $quiet ?: print 'Initial $cspell_words=' . $cspell_words . PHP_EOL; // Remove all double quotes and spaces. $cspell_words = str_replace(['"', ' '], ['', ''], $cspell_words); // Remove single quotes from start and end of words, but not from the middle. $words = str_replace([",'", "',"], [',', ','], ',' . $cspell_words . ','); + $quiet ?: print '$words=' . $words . PHP_EOL; } // The module's machine name might not be a real word, so add this. The value of @@ -94,10 +102,11 @@ foreach (new RecursiveIteratorIterator(new RecursiveDirectoryIterator('.', Recur $ignore_standard_files[] = $file->getPathname(); } } +$quiet ?: print '$module_name_parts=' . print_r($module_name_parts, TRUE) . PHP_EOL; // Merge into the existing json 'words' value, but cater for that being empty. // array_values() is needed after array_unique() to restore the keys to numeric. -$cspell_json['words'] = array_values(array_unique(array_merge( +$cspell_json['words'] = array_values(array_filter(array_unique(array_merge( $cspell_json['words'] ?? [], array_filter(explode(',', $words ?? '')), $module_name_parts, @@ -107,7 +116,8 @@ $cspell_json['words'] = array_values(array_unique(array_merge( // Add some common words that were dropped from core dictionary in Drupal 11.1 // See https://www.drupal.org/project/gitlab_templates/issues/3494834 ['endapply', 'nightwatchjs', 'testgroups'], -))); +)))); +$quiet ?: print '$cspell_json[\'words\']=' . print_r($cspell_json['words'], TRUE) . PHP_EOL; // ---------- // Flag Words @@ -115,12 +125,14 @@ $cspell_json['words'] = array_values(array_unique(array_merge( // // Get any flagged words from $_CSPELL_FLAGWORDS. if ($cspell_flagwords = getenv('_CSPELL_FLAGWORDS')) { + $quiet ?: print 'Input $cspell_flagwords=' . $cspell_flagwords . PHP_EOL; // Remove any quotes and spaces. Double quotes are added in json_encode. 
$cspell_flagwords = str_replace(["'", '"', ' '], ['', '', ''], $cspell_flagwords); $cspell_json['flagWords'] = array_values(array_unique(array_merge( $cspell_json['flagWords'] ?? [], array_filter(explode(',', $cspell_flagwords)), ))); + $quiet ?: print '$cspell_json[\'flagWords\']=' . print_r($cspell_json['flagWords'], TRUE) . PHP_EOL; } // ------------ @@ -163,12 +175,17 @@ $dictionary_definitions = [ 'name' => 'dictionary', 'path' => $webRoot . '/core/misc/cspell/dictionary.txt', ], - [ +]; +if ($project_dictionary = getenv('_CSPELL_DICTIONARY')) { + $quiet ?: print '$project_dictionary=' . $project_dictionary . PHP_EOL; + $dictionary_definitions[] = [ 'name' => 'project-words', - 'path' => './.cspell-project-words.txt', + 'path' => './' . $project_dictionary, 'description' => "The project's own custom dictionary (optional)", - ], -]; + ]; +} +$quiet ?: print 'Initial $dictionary_definitions=' . print_r($dictionary_definitions, TRUE) . PHP_EOL; + $dictionary_names = []; foreach ($dictionary_definitions as $key => $data) { // Add the 'name' if the file exists. Remove the array entry if it does not. @@ -179,6 +196,8 @@ foreach ($dictionary_definitions as $key => $data) { unset($dictionary_definitions[$key]); } } +$quiet ?: print 'After checking files, $dictionary_definitions=' . print_r($dictionary_definitions, TRUE) . PHP_EOL; + // These dictionaries are provided by CSpell. $built_in_dictionaries = [ 'companies', @@ -208,10 +227,14 @@ foreach ($cspell_json['dictionaryDefinitions'] ?? [] as $key => $dic) { } } $cspell_json['dictionaryDefinitions'] = merge_deep($dictionary_definitions, $cspell_json['dictionaryDefinitions'] ?? []); +$quiet ?: print '$cspell_json[\'dictionaryDefinitions\']=' . print_r($cspell_json['dictionaryDefinitions'], TRUE) . PHP_EOL; // --------------------------- // Write out the modified file // --------------------------- +// Allow for easy testing by avoiding overwriting the input file. 
+$cspell_filename .= $suffix; +$quiet ?: print 'At end $cspell_json=' . print_r($cspell_json, TRUE) . PHP_EOL; print "Writing json array to {$cspell_filename}" . PHP_EOL; file_put_contents($cspell_filename, json_encode($cspell_json, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES)); -- GitLab