Skip to content

[WIP] Invisible character filtering #426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,41 @@ docker run -i --rm \
ghcr.io/github/github-mcp-server
```

## Content Filtering

The GitHub MCP Server includes a content filtering feature that removes invisible characters and hidden content from GitHub issues, PRs, and comments. This helps prevent potential security risks and ensures better readability of content.

### What Gets Filtered

- **Invisible Unicode Characters**: Zero-width spaces, zero-width joiners, zero-width non-joiners, bidirectional marks, and other invisible Unicode characters
- **HTML Comments**: Comments that might contain hidden information
- **Hidden HTML Elements**: Script, style, iframe, and other potentially dangerous HTML elements
- **Collapsed Sections**: Details/summary elements that might hide content
- **Very Small Text**: Content with extremely small font size

### Controlling Content Filtering

Content filtering is enabled by default. You can disable it using the `--disable-content-filtering` flag:

```bash
github-mcp-server --disable-content-filtering
```

Or using the environment variable:

```bash
GITHUB_DISABLE_CONTENT_FILTERING=1 github-mcp-server
```

When using Docker, you can set the environment variable:

```bash
docker run -i --rm \
-e GITHUB_PERSONAL_ACCESS_TOKEN=<your-token> \
-e GITHUB_DISABLE_CONTENT_FILTERING=1 \
ghcr.io/github/github-mcp-server
```

## GitHub Enterprise Server

The flag `--gh-host` and the environment variable `GITHUB_HOST` can be used to set
Expand Down
21 changes: 12 additions & 9 deletions cmd/github-mcp-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,16 @@ var (
}

stdioServerConfig := ghmcp.StdioServerConfig{
Version: version,
Host: viper.GetString("host"),
Token: token,
EnabledToolsets: enabledToolsets,
DynamicToolsets: viper.GetBool("dynamic_toolsets"),
ReadOnly: viper.GetBool("read-only"),
ExportTranslations: viper.GetBool("export-translations"),
EnableCommandLogging: viper.GetBool("enable-command-logging"),
LogFilePath: viper.GetString("log-file"),
Version: version,
Host: viper.GetString("host"),
Token: token,
EnabledToolsets: enabledToolsets,
DynamicToolsets: viper.GetBool("dynamic_toolsets"),
ReadOnly: viper.GetBool("read-only"),
DisableContentFiltering: viper.GetBool("disable-content-filtering"),
ExportTranslations: viper.GetBool("export-translations"),
EnableCommandLogging: viper.GetBool("enable-command-logging"),
LogFilePath: viper.GetString("log-file"),
}

return ghmcp.RunStdioServer(stdioServerConfig)
Expand All @@ -73,6 +74,7 @@ func init() {
rootCmd.PersistentFlags().Bool("enable-command-logging", false, "When enabled, the server will log all command requests and responses to the log file")
rootCmd.PersistentFlags().Bool("export-translations", false, "Save translations to a JSON file")
rootCmd.PersistentFlags().String("gh-host", "", "Specify the GitHub hostname (for GitHub Enterprise etc.)")
rootCmd.PersistentFlags().Bool("disable-content-filtering", false, "Disable filtering of invisible characters and hidden content from GitHub issues, PRs, and comments")

// Bind flag to viper
_ = viper.BindPFlag("toolsets", rootCmd.PersistentFlags().Lookup("toolsets"))
Expand All @@ -82,6 +84,7 @@ func init() {
_ = viper.BindPFlag("enable-command-logging", rootCmd.PersistentFlags().Lookup("enable-command-logging"))
_ = viper.BindPFlag("export-translations", rootCmd.PersistentFlags().Lookup("export-translations"))
_ = viper.BindPFlag("host", rootCmd.PersistentFlags().Lookup("gh-host"))
_ = viper.BindPFlag("disable-content-filtering", rootCmd.PersistentFlags().Lookup("disable-content-filtering"))

// Add subcommands
rootCmd.AddCommand(stdioCmd)
Expand Down
26 changes: 18 additions & 8 deletions internal/ghmcp/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ type MCPServerConfig struct {
// ReadOnly indicates if we should only offer read-only tools
ReadOnly bool

// DisableContentFiltering disables filtering of invisible characters and hidden content
DisableContentFiltering bool

// Translator provides translated text for the server tooling
Translator translations.TranslationHelperFunc
}
Expand Down Expand Up @@ -91,7 +94,10 @@ func NewMCPServer(cfg MCPServerConfig) (*server.MCPServer, error) {
OnBeforeInitialize: []server.OnBeforeInitializeFunc{beforeInit},
}

ghServer := github.NewServer(cfg.Version, server.WithHooks(hooks))
ghServer := github.NewServerWithConfig(github.ServerConfig{
Version: cfg.Version,
DisableContentFiltering: cfg.DisableContentFiltering,
}, server.WithHooks(hooks))

enabledToolsets := cfg.EnabledToolsets
if cfg.DynamicToolsets {
Expand Down Expand Up @@ -160,6 +166,9 @@ type StdioServerConfig struct {
// ReadOnly indicates if we should only register read-only tools
ReadOnly bool

// DisableContentFiltering disables filtering of invisible characters and hidden content
DisableContentFiltering bool

// ExportTranslations indicates if we should export translations
// See: https://github.com/github/github-mcp-server?tab=readme-ov-file#i18n--overriding-descriptions
ExportTranslations bool
Expand All @@ -180,13 +189,14 @@ func RunStdioServer(cfg StdioServerConfig) error {
t, dumpTranslations := translations.TranslationHelper()

ghServer, err := NewMCPServer(MCPServerConfig{
Version: cfg.Version,
Host: cfg.Host,
Token: cfg.Token,
EnabledToolsets: cfg.EnabledToolsets,
DynamicToolsets: cfg.DynamicToolsets,
ReadOnly: cfg.ReadOnly,
Translator: t,
Version: cfg.Version,
Host: cfg.Host,
Token: cfg.Token,
EnabledToolsets: cfg.EnabledToolsets,
DynamicToolsets: cfg.DynamicToolsets,
ReadOnly: cfg.ReadOnly,
DisableContentFiltering: cfg.DisableContentFiltering,
Translator: t,
})
if err != nil {
return fmt.Errorf("failed to create MCP server: %w", err)
Expand Down
145 changes: 145 additions & 0 deletions pkg/filtering/content_filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
package filtering

import (
"regexp"
"strings"
)

var (
// invisibleCharsRegex matches single invisible or layout-control code points:
// U+200B-U+200F (zero-width space/non-joiner/joiner and the left-to-right and
// right-to-left marks), U+2028-U+202E (line and paragraph separators plus the
// bidirectional embedding/override controls), U+2060-U+2064 (word joiner and
// invisible operators) and U+FEFF (zero-width no-break space / BOM).
invisibleCharsRegex = regexp.MustCompile(`[\x{200B}-\x{200F}\x{2028}-\x{202E}\x{2060}-\x{2064}\x{FEFF}]`)

// htmlCommentsRegex matches an HTML comment, including multi-line ones, up to
// the first "-->" that follows the opening "<!--".
htmlCommentsRegex = regexp.MustCompile(`<!--[\s\S]*?-->`)

// Elements that could carry hidden or active content. Each pattern matches
// from the opening tag (with any attributes) to the first matching closing tag.
// Go's regexp doesn't support backreferences, so each tag is listed explicitly.
// NOTE(review): the patterns are case-sensitive and all of them require a
// closing tag, so a lone `<embed ...>` or `<link ...>` without `</embed>` /
// `</link>` is not matched.
htmlScriptRegex = regexp.MustCompile(`<script[^>]*>[\s\S]*?</script>`)
htmlStyleRegex = regexp.MustCompile(`<style[^>]*>[\s\S]*?</style>`)
htmlIframeRegex = regexp.MustCompile(`<iframe[^>]*>[\s\S]*?</iframe>`)
htmlObjectRegex = regexp.MustCompile(`<object[^>]*>[\s\S]*?</object>`)
htmlEmbedRegex = regexp.MustCompile(`<embed[^>]*>[\s\S]*?</embed>`)
htmlSvgRegex = regexp.MustCompile(`<svg[^>]*>[\s\S]*?</svg>`)
htmlMathRegex = regexp.MustCompile(`<math[^>]*>[\s\S]*?</math>`)
htmlLinkRegex = regexp.MustCompile(`<link[^>]*>[\s\S]*?</link>`)

// htmlAttributesRegex matches a whole tag that carries at least one
// double-quoted style, data-*, hidden or class attribute. Unquoted,
// single-quoted and value-less forms (e.g. a bare `hidden`) do not match.
htmlAttributesRegex = regexp.MustCompile(`<[^>]*(?:style|data-[\w-]+|hidden|class)="[^"]*"[^>]*>`)

// collapsedSectionsRegex matches a details element up to the first
// "</details>". The opening tag must be exactly "<details>", without attributes.
collapsedSectionsRegex = regexp.MustCompile(`<details>[\s\S]*?</details>`)

// smallTextRegex matches an element whose opening tag has a double-quoted
// style containing a font-size of 0, 0.<digits> or a single digit 0-3 followed
// by px, pt, em or %, through the first closing tag of any name that follows.
smallTextRegex = regexp.MustCompile(`<[^>]*style="[^"]*font-size:\s*(?:0|0\.\d+|[0-3])(?:px|pt|em|%)[^"]*"[^>]*>[\s\S]*?</[^>]+>`)

// excessiveWhitespaceRegex matches a run of four or more consecutive newlines.
excessiveWhitespaceRegex = regexp.MustCompile(`\n{4,}`)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot perhaps also add a filter for excessive spaces and tabs too. Probably for spaces 15+ and for tabs 6+

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added filters for excessive spaces and tabs as requested. The implementation filters sequences of 15+ consecutive spaces and 6+ consecutive tabs, replacing them with normalized versions (14 spaces and 5 tabs respectively). Changes are in commit 2e27e2a.


// excessiveSpacesRegex matches a run of 15 or more consecutive space characters.
excessiveSpacesRegex = regexp.MustCompile(` {15,}`)

// excessiveTabsRegex matches a run of 6 or more consecutive tab characters.
excessiveTabsRegex = regexp.MustCompile(`\t{6,}`)
)

// Config carries the settings that control content filtering.
type Config struct {
	// DisableContentFiltering switches every filter off when set to true.
	// The zero value (false) leaves filtering enabled.
	DisableContentFiltering bool
}

// DefaultConfig returns a newly allocated Config in its default state, in
// which content filtering is enabled (DisableContentFiltering is false).
func DefaultConfig() *Config {
	cfg := Config{DisableContentFiltering: false}
	return &cfg
}

// FilterContent returns input with invisible or hidden content removed or
// replaced by visible markers.
//
// A nil cfg means "filter with defaults". When cfg.DisableContentFiltering is
// true, or input is empty, input is returned unchanged.
//
// The filters run in this order:
//  1. invisible Unicode characters are deleted;
//  2. HTML comments become "[HTML_COMMENT]";
//  3. script/style/iframe/object/embed/svg/math/link elements become
//     "[HTML_ELEMENT]";
//  4. elements styled with a very small font size become "[SMALL_TEXT]";
//  5. style, data-*, hidden and class attributes are stripped from tags;
//  6. <details> sections are expanded into a visible heading plus body;
//  7. long runs of newlines, spaces and tabs are shortened.
func FilterContent(input string, cfg *Config) string {
	if cfg != nil && cfg.DisableContentFiltering {
		return input
	}
	if input == "" {
		return input
	}

	// Longest runs that the whitespace filters leave untouched: the patterns
	// fire at 15+ spaces and 6+ tabs, so longer runs are cut back to one below
	// each threshold instead of being collapsed to a single space.
	const (
		maxSpaceRun = 14
		maxTabRun   = 5
	)

	// Remove invisible characters.
	result := invisibleCharsRegex.ReplaceAllString(input, "")

	// Replace HTML comments with a marker.
	result = htmlCommentsRegex.ReplaceAllString(result, "[HTML_COMMENT]")

	// Replace potentially dangerous HTML elements.
	result = htmlScriptRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlStyleRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlIframeRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlObjectRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlEmbedRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlSvgRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlMathRegex.ReplaceAllString(result, "[HTML_ELEMENT]")
	result = htmlLinkRegex.ReplaceAllString(result, "[HTML_ELEMENT]")

	// Small-text detection keys on the style attribute, so it has to run
	// before the attribute clean-up below strips every style="..." away;
	// in the opposite order this filter could never match.
	result = smallTextRegex.ReplaceAllString(result, "[SMALL_TEXT]")

	// Strip attributes that might be used for hiding content.
	result = htmlAttributesRegex.ReplaceAllStringFunc(result, cleanHTMLAttributes)

	// Replace collapsed sections with a visible equivalent.
	result = collapsedSectionsRegex.ReplaceAllStringFunc(result, makeCollapsedSectionVisible)

	// Normalize excessive blank lines, spaces and tabs.
	result = excessiveWhitespaceRegex.ReplaceAllString(result, "\n\n\n")
	result = excessiveSpacesRegex.ReplaceAllString(result, strings.Repeat(" ", maxSpaceRun))
	result = excessiveTabsRegex.ReplaceAllString(result, strings.Repeat("\t", maxTabRun))

	return result
}

// hidingAttributesRegex matches one double-quoted style, data-*, hidden or
// class attribute together with the whitespace in front of it. It is compiled
// once at package scope because cleanHTMLAttributes runs for every matching tag.
var hidingAttributesRegex = regexp.MustCompile(`\s+(?:style|data-[\w-]+|hidden|class)="[^"]*"`)

// cleanHTMLAttributes returns tag with every double-quoted style, data-*,
// hidden and class attribute removed; all other attributes are left in place.
//
// This is a deliberately simple, regex-based implementation: attributes that
// are unquoted, single-quoted or value-less (for example a bare `hidden`) are
// not recognised. A more sophisticated implementation would parse the HTML and
// remove attributes selectively.
func cleanHTMLAttributes(tag string) string {
	return hidingAttributesRegex.ReplaceAllString(tag, "")
}

// detailsSummaryRegex captures the text between the first <summary> and the
// next </summary>. The (?s) flag lets "." match newlines so a summary that
// spans several lines is still captured. Compiled once at package scope.
var detailsSummaryRegex = regexp.MustCompile(`(?s)<summary>(.*?)</summary>`)

// makeCollapsedSectionVisible rewrites a "<details>...</details>" section as
// plain visible text of the form "\n\n**<summary>:**\n<content>\n\n".
//
// The heading is the text of the first <summary> element, or
// "Collapsed section" when none is found. The content is everything after the
// first "</summary>" (or after the leading "<details>" when there is no
// summary), with the trailing "</details>" removed.
func makeCollapsedSectionVisible(detailsSection string) string {
	summary := "Collapsed section"
	if m := detailsSummaryRegex.FindStringSubmatch(detailsSection); len(m) > 1 {
		summary = m[1]
	}

	var content string
	if parts := strings.SplitN(detailsSection, "</summary>", 2); len(parts) > 1 {
		// Keep only what follows the summary.
		content = strings.TrimSuffix(parts[1], "</details>")
	} else {
		// No summary tag found: just drop the surrounding details tags.
		content = strings.TrimPrefix(detailsSection, "<details>")
		content = strings.TrimSuffix(content, "</details>")
	}

	// Format as a visible section.
	return "\n\n**" + summary + ":**\n" + content + "\n\n"
}
Loading