| // Copyright 2018 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ |
| #define UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ |
| |
| #include <memory> |
| #include <string> |
| #include <unordered_map> |
| #include <unordered_set> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/macros.h" |
| #include "third_party/cld_3/src/src/nnet_language_identifier.h" |
| #include "ui/accessibility/ax_enums.mojom-forward.h" |
| #include "ui/accessibility/ax_export.h" |
| #include "ui/accessibility/ax_tree_observer.h" |
| |
| namespace ui { |
| |
| class AXNode; |
| class AXTree; |
| |
| // This module implements language detection enabling Chrome to automatically |
| // detect the language for runs of text within the page. |
| // |
| // Node-level language detection runs once per page after the load complete |
| // event. This involves two passes: |
| // *Detect* walks the tree from the given root using cld3 to detect up to 3 |
| // potential languages per node. A ranked list is created enumerating |
| // all potential languages on a page. |
| // *Label* re-walks the tree, assigning a language to each node considering |
| // the potential languages from the detect phase, page level |
| // statistics, and the assigned languages of ancestor nodes. |
| // |
| // Optionally an embedder may run *sub-node* language detection which attempts |
| // to assign languages for runs of text within a node, potentially down to the |
| // individual character level. This is useful in cases where a single paragraph |
| // involves switching between multiple languages, and where the speech engine |
| // doesn't automatically switch voices to handle different character sets. |
| // Due to the potentially small lengths of text runs involved this tends to be |
| // lower in accuracy, and works best when a node is composed of multiple |
| // languages with easily distinguishable scripts. |
| |
| // AXLanguageInfo represents the local language detection data for all text |
| // within an AXNode. Stored on AXNode. |
| struct AX_EXPORT AXLanguageInfo { |
| AXLanguageInfo(); |
| ~AXLanguageInfo(); |
| |
| // This is the final language we have assigned for this node during the |
| // 'label' step, it is the result of merging: |
| // a) The detected language for this node |
| // b) The declared lang attribute on this node |
| // c) the (recursive) language of the parent (detected or declared). |
| // |
| // This will be the empty string if no language was assigned during label |
| // phase. |
| // |
| // IETF BCP 47 Language code (rfc5646). |
| // examples: |
| // 'de' |
| // 'de-DE' |
| // 'en' |
| // 'en-US' |
| // 'es-ES' |
| // |
| // This should not be read directly by clients of AXNode, instead clients |
| // should call AXNode::GetLanguage(). |
| // TODO(chrishall): consider renaming this to `assigned_language`. |
| std::string language; |
| |
| // Detected languages for this node sorted as returned by |
| // FindTopNMostFreqLangs, which sorts in decreasing order of probability, |
| // filtered to remove any unreliable results. |
| std::vector<std::string> detected_languages; |
| }; |
| |
| // Each AXLanguageSpan contains a language, a probability, and start and end |
| // indices. The indices are used to specify the substring that contains the |
| // associated language. The string which the indices are relative to is not |
| // included in this structure. |
| // Also, the indices are relative to a Utf8 string. |
| // See documentation on GetLanguageAnnotationForStringAttribute for details |
| // on how to associate this object with a string. |
| struct AX_EXPORT AXLanguageSpan { |
| int start_index; |
| int end_index; |
| std::string language; |
| float probability; |
| }; |
| |
| // A single AXLanguageInfoStats instance is stored on each AXTree and contains |
| // statistics on detected languages for all the AXNodes in that tree. |
| // |
| // We rely on these tree-level statistics when labelling individual nodes, to |
| // provide extra signals to increase our confidence in assigning a detected |
| // language. |
| // |
| // These tree level statistics are also used to send reports on the language |
| // detection feature to enable tuning. |
| // |
| // The Label step will only assign a detected language to a node if that |
| // language is one of the most frequent languages on the page. |
| // |
| // For example, if a single node has detected_languages (in order of probability |
| // assigned by cld_3): da-DK, en-AU, fr-FR, but the page statistics overall |
| // indicate that the page is generally in en-AU and ja-JP, it is more likely to |
| // be a mis-recognition of Danish than an accurate assignment, so we assign |
| // en-AU instead of da-DK. |
| class AX_EXPORT AXLanguageInfoStats { |
| public: |
| AXLanguageInfoStats(); |
| ~AXLanguageInfoStats(); |
| |
| // Each AXLanguageInfoStats is tied to a specific AXTree, copying is safe but |
| // logically doesn't make sense. |
| AXLanguageInfoStats(const AXLanguageInfoStats&) = delete; |
| AXLanguageInfoStats& operator=(const AXLanguageInfoStats&) = delete; |
| |
| // Adjust our statistics to add provided detected languages. |
| void Add(const std::vector<std::string>& languages); |
| |
| // Fetch the score for a given language. |
| int GetScore(const std::string& lang) const; |
| |
| // Check if a given language is within the top results. |
| bool CheckLanguageWithinTop(const std::string& lang); |
| |
| // Record statistics based on how we labelled a node. |
| // We consider the language we labelled the node with, the language the author |
| // assigned, and whether or not we assigned our highest confidence detection |
| // result. |
| void RecordLabelStatistics(const std::string& labelled_lang, |
| const std::string& author_lang, |
| bool labelled_with_first_result); |
| |
| // Update metrics to reflect we attempted to detect language for a node. |
| void RecordDetectionAttempt(); |
| |
| // Report metrics to UMA. |
| // Reports statistics since last run, run once detect & label iteration. |
| // If successful, will reset statistics. |
| void ReportMetrics(); |
| |
| private: |
| // Allow access from a fixture only used in testing. |
| friend class AXLanguageDetectionTestFixture; |
| |
| // Store a count of the occurrences of a given language. |
| std::unordered_map<std::string, int> lang_counts_; |
| |
| // Cache of last calculated top language results. |
| // A vector of pairs of (score, language) sorted by descending score. |
| std::vector<std::pair<int, std::string>> top_results_; |
| |
| // Boolean recording that we have not mutated the statistics since last |
| // calculating top results, setting this to false will cause recalculation |
| // when the results are next fetched. |
| bool top_results_valid_; |
| |
| // Invalidate the top results cache. |
| void InvalidateTopResults(); |
| |
| // Compute the top results and store them in cache. |
| void GenerateTopResults(); |
| |
| // TODO(chrishall): Do we want this for testing? or is it better to only test |
| // the generated metrics by inspecting the histogram? |
| // Boolean used for testing metrics only, disables clearing of metrics. |
| bool disable_metric_clearing_; |
| void ClearMetrics(); |
| |
| // *** Statistics recorded for metric reporting. *** |
| // All statistics represent a single iteration of language detection and are |
| // reset after each successful call of ReportMetrics. |
| |
| // The number of nodes we attempted detection on. |
| int count_detection_attempted_; |
| |
| // The number of nodes we got detection results for. |
| int count_detection_results_; |
| |
| // The number of nodes we assigned a label to. |
| int count_labelled_; |
| |
| // The number of nodes we assigned a label to which was the highest confident |
| // detected language. |
| int count_labelled_with_top_result_; |
| |
| // The number of times we labelled a language which disagreed with the node's |
| // author provided language annotation. |
| // |
| // If we have |
| // <div lang='en'><span>...</span><span>...</span></div> |
| // and we detect and label both spans as having language 'fr', then we count |
| // this as `2` overrides. |
| int count_overridden_; |
| |
| // Set of top language detected for every node, used to generate the unique |
| // number of detected languages metric (LangsPerPage). |
| std::unordered_set<std::string> unique_top_lang_detected_; |
| }; |
| |
| // AXLanguageDetectionObserver is registered as a change observer on an AXTree |
| // and will run language detection after each update to the tree. |
| // |
| // We have kept this observer separate from the AXLanguageDetectionManager as we |
| // are aiming to launch language detection in two phases and wanted to try keep |
| // the code paths somewhat separate. |
| // |
| // TODO(chrishall): After both features have launched we could consider merging |
| // AXLanguageDetectionObserver into AXLanguageDetectionManager. |
| // |
| // TODO(chrishall): Investigate the cost of using AXTreeObserver, given that it |
| // has many empty virtual methods which are called for every AXTree change and |
| // we are only currently interested in OnAtomicUpdateFinished. |
| class AX_EXPORT AXLanguageDetectionObserver : public ui::AXTreeObserver { |
| public: |
| // Observer constructor will register itself with the provided AXTree. |
| AXLanguageDetectionObserver(AXTree* tree); |
| |
| // Observer destructor will remove itself as an observer from the AXTree. |
| ~AXLanguageDetectionObserver() override; |
| |
| // AXLanguageDetectionObserver contains a pointer so copying is non-trivial. |
| AXLanguageDetectionObserver(const AXLanguageDetectionObserver&) = delete; |
| AXLanguageDetectionObserver& operator=(const AXLanguageDetectionObserver&) = |
| delete; |
| |
| private: |
| void OnAtomicUpdateFinished(ui::AXTree* tree, |
| bool root_changed, |
| const std::vector<Change>& changes) override; |
| |
| // Non-owning pointer to AXTree, used to de-register observer on destruction. |
| AXTree* const tree_; |
| }; |
| |
| // AXLanguageDetectionManager manages all of the context needed for language |
| // detection within an AXTree. |
| class AX_EXPORT AXLanguageDetectionManager { |
| public: |
| // Construct an AXLanguageDetectionManager for the specified tree. |
| explicit AXLanguageDetectionManager(AXTree* tree); |
| ~AXLanguageDetectionManager(); |
| |
| // AXLanguageDetectionManager contains pointers so copying is non-trivial. |
| AXLanguageDetectionManager(const AXLanguageDetectionManager&) = delete; |
| AXLanguageDetectionManager& operator=(const AXLanguageDetectionManager&) = |
| delete; |
| |
| // Detect languages for each node in the tree managed by this manager. |
| // This is the first pass in detection and labelling. |
| // This only detects the language, it does not label it, for that see |
| // LabelLanguageForSubtree. |
| void DetectLanguages(); |
| |
| // Label languages for each node in the tree manager by this manager. |
| // This is the second pass in detection and labelling. |
| // This will label the language, but relies on the earlier detection phase |
| // having already completed. |
| void LabelLanguages(); |
| |
| // Sub-node language detection for a given string attribute. |
| // For example, if a node has name: "My name is Fred", then calling |
| // GetLanguageAnnotationForStringAttribute(*node, ax::mojom::StringAttribute:: |
| // kName) would return language detection information about "My name is Fred". |
| std::vector<AXLanguageSpan> GetLanguageAnnotationForStringAttribute( |
| const AXNode& node, |
| ax::mojom::StringAttribute attr); |
| |
| // Construct and register a dynamic content change observer for this manager. |
| void RegisterLanguageDetectionObserver(); |
| |
| private: |
| friend class AXLanguageDetectionObserver; |
| |
| // Allow access from a fixture only used in testing. |
| friend class AXLanguageDetectionTestFixture; |
| |
| // Helper methods to test if language detection features are enabled. |
| static bool IsStaticLanguageDetectionEnabled(); |
| static bool IsDynamicLanguageDetectionEnabled(); |
| |
| // Perform detection for subtree rooted at subtree_root. |
| void DetectLanguagesForSubtree(AXNode* subtree_root); |
| // Perform detection for node. Will not descend into children. |
| void DetectLanguagesForNode(AXNode* node); |
| // Perform labelling for subtree rooted at subtree_root. |
| void LabelLanguagesForSubtree(AXNode* subtree_root); |
| // Perform labelling for node. Will not descend into children. |
| void LabelLanguagesForNode(AXNode* node); |
| |
| // This language identifier is constructed with a default minimum byte length |
| // of chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider and is |
| // used for detecting page-level languages. |
| chrome_lang_id::NNetLanguageIdentifier language_identifier_; |
| |
| // This language identifier is constructed with a minimum byte length of |
| // kShortTextIdentifierMinByteLength so it can be used for detecting languages |
| // of shorter text (e.g. one character). |
| chrome_lang_id::NNetLanguageIdentifier short_text_language_identifier_; |
| |
| // The observer to support dynamic content language detection. |
| std::unique_ptr<AXLanguageDetectionObserver> language_detection_observer_; |
| |
| // Non-owning back pointer to the tree which owns this manager. |
| AXTree* tree_; |
| |
| AXLanguageInfoStats lang_info_stats_; |
| }; |
| |
| } // namespace ui |
| |
| #endif // UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ |