Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ var TEXT_REGEXP = /^[\x20-\x7e\x80-\xff]+$/
var TOKEN_REGEXP = /^[!#$%&'*+.0-9A-Z^_`a-z|~-]+$/

/**
* RegExp for various RFC 5987 grammar
* RegExp for parsing extended parameter values per RFC 5987.
*
* ext-value = charset "'" [ language ] "'" value-chars
* charset = "UTF-8" / "ISO-8859-1" / mime-charset
Expand All @@ -99,19 +99,20 @@ var TOKEN_REGEXP = /^[!#$%&'*+.0-9A-Z^_`a-z|~-]+$/
* / "!" / "#" / "$" / "%" / "&"
* / "+" / "-" / "^" / "_" / "`"
* / "{" / "}" / "~"
* language = ( 2*3ALPHA [ extlang ] )
* / 4ALPHA
* / 5*8ALPHA
* extlang = *3( "-" 3ALPHA )
*
* language = <Language-Tag as defined in RFC 5646, Section 2.1>
* (The two literal single quotes MUST appear, but the
* language field between them is optional and may be empty.
* The language content is ignored rather than validated.)
*
* value-chars = *( pct-encoded / attr-char )
* pct-encoded = "%" HEXDIG HEXDIG
* attr-char = ALPHA / DIGIT
* / "!" / "#" / "$" / "&" / "+" / "-" / "."
* / "^" / "_" / "`" / "|" / "~"
* @private
*/

var EXT_VALUE_REGEXP = /^([A-Za-z0-9!#$%&+\-^_`{}~]+)'(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3}|[A-Za-z]{4,8}|)'((?:%[0-9A-Fa-f]{2}|[A-Za-z0-9!#$&+.^_`|~-])+)$/
// Anchored at both ends so the entire string must be an ext-value.
// Capture 1 is the charset, capture 2 is the value-chars; whatever sits
// between the two mandatory single quotes (the language) is skipped unvalidated.
var EXT_VALUE_REGEXP = /^([A-Za-z0-9!#$%&+\-^_`{}~]+)'[^']*'((?:%[0-9A-Fa-f]{2}|[A-Za-z0-9!#$&+.^_`|~-])+)$/
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

@jonchurch jonchurch Mar 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yar, I prefer to break regex down into pieces you can semantically name like:

// Named fragments of the ext-value pattern, each kept as regex source text:
const charset = /([A-Za-z0-9!#$%&+\-^_`{}~]+)/.source; // capture 1: the charset (e.g. "UTF-8")
const openQuote = /'/.source; // single quote that follows the charset
const language = /(?:[^']*)/.source; // anything up to the next single quote; the language is not inspected
const closeQuote = /'/.source; // single quote that precedes the value
const valueChars = /((?:%[0-9A-Fa-f]{2}|[A-Za-z0-9!#$&+.^_`|~-])+)$/.source; // capture 2: percent-encoded / attr-char run through end of input

// Stitch the fragments together in grammar order, anchored at the start:
const regexStr = `^${charset}${openQuote}${language}${closeQuote}${valueChars}`;

// Build the matcher from the assembled source:
const EXT_VALUE_REGEXP = new RegExp(regexStr);

But it does make it harder to just copy the full thing out

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like to write parsers instead of regex; they solve the same problems. But yeah, until someone opens a PR to convert all this regex to a parser, I am good with doing what you propose here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the approach Jon proposes, it really helps with readability for people who are not very good with regex.


/**
* RegExp for various RFC 6266 grammar
Expand Down
12 changes: 12 additions & 0 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -380,13 +380,25 @@ describe('contentDisposition.parse(string)', function () {
/unsupported charset/)
})

it('should reject when missing embedded language', function () {
  // The extended value has no single-quote delimiters after the charset.
  var header = 'attachment; filename*=UTF-8%E2%82%AC%20rates.pdf'
  var parseHeader = contentDisposition.parse.bind(null, header)

  assert.throws(parseHeader, /invalid extended field value/)
})

it('should parse with embedded language', function () {
  // The language section between the quotes is 'en'.
  var header = 'attachment; filename*=UTF-8\'en\'%E2%82%AC%20rates.pdf'
  var expected = {
    type: 'attachment',
    parameters: { filename: '€ rates.pdf' }
  }

  assert.deepEqual(contentDisposition.parse(header), expected)
})

it('should parse with embedded language with region subtag', function () {
  // The language section between the quotes is 'en-US' (language plus region).
  var header = 'attachment; filename*=UTF-8\'en-US\'%E2%82%AC%20rates.pdf'
  var expected = {
    type: 'attachment',
    parameters: { filename: '€ rates.pdf' }
  }

  assert.deepEqual(contentDisposition.parse(header), expected)
})

it('should prefer extended parameter value', function () {
assert.deepEqual(contentDisposition.parse('attachment; filename="EURO rates.pdf"; filename*=UTF-8\'\'%E2%82%AC%20rates.pdf'), {
type: 'attachment',
Expand Down