正则表达式精通指南2026

"正则表达式很难"——没错。但一旦掌握，就能减少90%的文本处理时间。

本指南将通过示例介绍实务中最常用的正则表达式模式。

正则表达式基础

基本语法

模式	说明	示例
`.`	任意单个字符	`a.c` → "abc", "a1c"
`*`	0个或多个	`ab*c` → "ac", "abc", "abbc"
`+`	1个或多个	`ab+c` → "abc", "abbc"
`?`	0个或1个	`colou?r` → "color", "colour"
`^`	字符串开头	`^Hello`
`$`	字符串结尾	`world$`
`\d`	数字 [0-9]	`\d{3}` → "123"
`\w`	单词字符 [a-zA-Z0-9_]	`\w+`
`\s`	空白字符	`\s+`

字符类

[abc]     - a, b, c 之一
[^abc]    - 除 a, b, c 之外
[a-z]     - a 到 z
[A-Z]     - A 到 Z
[0-9]     - 0 到 9
[a-zA-Z]  - 所有字母

量词

{n}       - 恰好n个
{n,}      - n个或更多
{n,m}     - n到m个
*         - 0个或更多 ({0,})
+         - 1个或更多 ({1,})
?         - 0或1个 ({0,1})

分组和捕获

(abc)     - 捕获组
(?:abc)   - 非捕获组
(?<name>abc) - 命名组
\1        - 第一个组的反向引用

实战模式：验证

邮箱地址

// Basic e-mail shape: local part, "@", domain, then a dot and a TLD of 2+ letters
const emailRegex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;

// Try it against one well-formed and one malformed address
const acceptsValidAddress = emailRegex.test('user@example.com');  // true
const acceptsBrokenAddress = emailRegex.test('invalid-email');    // false

模式分析：

^                     - 开头
[a-zA-Z0-9._%+-]+     - 本地部分（1个或多个字符）
@                     - @ 符号
[a-zA-Z0-9.-]+        - 域名（1个或多个字符）
\.                    - . (转义)
[a-zA-Z]{2,}          - TLD（2个或更多字符）
$                     - 结尾

URL

// Optional http(s) scheme, host, a 2-6 character TLD, then an optional path.
// The path group is a single `[...]*`. Wrapping it in a second `*`
// (`([...]*)*`) is a nested quantifier: on input that almost matches, the
// engine tries every way of splitting the path and the time grows
// exponentially with the path length.
const urlRegex = /^(https?:\/\/)?([\da-z.-]+)\.([a-z.]{2,6})([/\w .-]*)\/?$/;

// Tests
urlRegex.test('https://example.com');           // true
urlRegex.test('https://sub.example.com/path');  // true
urlRegex.test('invalid url');                   // false

中国手机号码

// Mainland China mobile number: "1", a second digit from 3 to 9, then nine more digits
const mobileRegex = /^1[3-9]\d{9}$/;

// Try it against a valid number and one with a disallowed second digit
const acceptsMobile = mobileRegex.test('13812345678');     // true
const acceptsBadPrefix = mobileRegex.test('12345678901');  // false

密码强度

// Strong password: 8+ characters taken from letters, digits and @$!%*?&,
// with each of the four classes present at least once.
//
// How the pattern is built:
//   (?=.*[a-z])            lookahead: at least one lowercase letter
//   (?=.*[A-Z])            lookahead: at least one uppercase letter
//   (?=.*\d)               lookahead: at least one digit
//   (?=.*[@$!%*?&])        lookahead: at least one special character
//   [A-Za-z\d@$!%*?&]{8,}  eight or more characters from the allowed set
const strongPassword = /^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/;

信用卡号

// Visa: starts with 4; 13 or 16 digits in total
const visaRegex = /^4[0-9]{12}(?:[0-9]{3})?$/;

// Mastercard: 16 digits whose first four are 5100-5599 or fall in the
// 2-series range 2221-2720
const mastercardRegex = /^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$/;

// Any 16-digit number written as four groups of four, with an optional
// hyphen or space between groups (brand-agnostic; does not cover 15-digit cards)
const cardRegex = /^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}$/;

实战模式：文本提取

HTML标签内容提取

const html = '<div class="title">Hello World</div>';

// Capture whatever sits between the opening and closing <div>; the lazy
// quantifier stops at the first closing tag.
const contentRegex = /<div[^>]*>(.*?)<\/div>/;
const match = contentRegex.exec(html);
const [, innerText] = match;
console.log(innerText); // "Hello World"

// Strip every tag and keep only the text
const anyTag = /<[^>]*>/g;
const noTags = html.replace(anyTag, '');
console.log(noTags); // "Hello World"

从URL提取域名

const url = 'https://www.example.com/path/page.html';

// Skip an optional scheme and an optional "www.", then capture everything up
// to the first slash.
const domainRegex = /^(?:https?:\/\/)?(?:www\.)?([^/]+)/;
const [, domain] = domainRegex.exec(url);
console.log(domain); // "example.com"

日志文件解析

const logLine = '[2026-02-21 14:30:45] ERROR: Connection timeout at module.js:42';

// Groups, in order: timestamp inside [...], level, message, file name, line number
const logRegex = /\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+): (.+) at (.+):(\d+)/;

// Element 0 is the whole match, so skip it and unpack the five captures.
const captures = logRegex.exec(logLine);
const [, datetime, level, message, file, line] = captures;

// Fields: datetime "2026-02-21 14:30:45", level "ERROR",
// message "Connection timeout", file "module.js", line "42"
const parsedEntry = { datetime, level, message, file, line };
console.log(parsedEntry);

CSV解析（逗号分隔）

const csvLine = 'John,"Doe, Jr.",30,"New York, NY"';

// One field per match: either a double-quoted field (with "" as an escaped
// quote) or a run of non-comma characters. The (?<=^|,) lookbehind ties each
// field to the line start or the preceding comma without consuming it, so an
// empty field is a zero-length match rather than a lost one.
const csvRegex = /(?<=^|,)("(?:[^"]*(?:""[^"]*)*)"|[^,]*)/g;

/**
 * Split one CSV line into its fields, ignoring commas inside double quotes.
 *
 * @param {string} line - A single CSV record without a trailing newline.
 * @returns {string[]} Field values; surrounding quotes are removed and ""
 *   inside a quoted field becomes ". An empty line yields [''].
 */
function parseCsvLine(line) {
  const fields = [];
  // matchAll steps past zero-length matches on its own. A manual exec() loop
  // would spin forever on an empty field because lastIndex never advances.
  for (const [field] of line.matchAll(csvRegex)) {
    // Only unwrap and un-double quotes when the whole field is quoted.
    const quoted = /^"([\s\S]*)"$/.exec(field);
    fields.push(quoted ? quoted[1].replace(/""/g, '"') : field);
  }
  return fields;
}

const fields = parseCsvLine(csvLine);

console.log(fields); // ["John", "Doe, Jr.", "30", "New York, NY"]

实战模式：搜索和替换

电话号码格式化

const phone = '13812345678';

// Split into 3-4-4 digit groups and join them with spaces
const groupedDigits = /(\d{3})(\d{4})(\d{4})/;
const formatted = phone.replace(
  groupedDigits,
  (_whole, head, middle, tail) => `${head} ${middle} ${tail}`,
);
console.log(formatted); // "138 1234 5678"

金额格式化

const amount = '1234567890';

// Thousands separators: match every non-boundary position that is followed
// by a whole number of 3-digit groups up to the end of the digit run.
const thousandsPosition = /\B(?=(\d{3})+(?!\d))/g;
const formatted = amount.replace(thousandsPosition, ',');
console.log(formatted); // "1,234,567,890"

脱敏

// Mask an e-mail address: keep the first two characters and the domain
const email = 'username@example.com';
const emailParts = /(.{2})(.*)(@.*)/;
const masked = email.replace(
  emailParts,
  (_whole, visiblePrefix, _hidden, domainPart) => `${visiblePrefix}***${domainPart}`,
);
console.log(masked); // "us***@example.com"

// Mask a phone number: hide the middle group
const phone = '138-1234-5678';
const phoneParts = /(\d{3})-(\d{4})-(\d{4})/;
const maskedPhone = phone.replace(
  phoneParts,
  (_whole, prefix, _middle, suffix) => `${prefix}-****-${suffix}`,
);
console.log(maskedPhone); // "138-****-5678"

空白清理

const text = '  Hello    World  ';

// Strip leading and trailing whitespace (what trim() does)
const edgeWhitespace = /^\s+|\s+$/g;
const trimmed = text.replace(edgeWhitespace, '');

// Collapse every run of whitespace into a single space, then trim the ends
const whitespaceRun = /\s+/g;
const collapsed = text.replace(whitespaceRun, ' ');
const normalized = collapsed.trim();
console.log(normalized); // "Hello World"

高级功能

前瞻（Lookahead）

// Positive lookahead: (?=...)
// Matches "foo" only when "bar" follows; "bar" itself is not part of the match
const regex = /foo(?=bar)/;
regex.exec('foobar');  // ["foo"]
regex.exec('foobaz');  // null

// Negative lookahead: (?!...)
// Matches "foo" only when "bar" does NOT follow
const regex2 = /foo(?!bar)/;
regex2.exec('foobaz'); // ["foo"]
regex2.exec('foobar'); // null

后顾（Lookbehind）

// Positive lookbehind: (?<=...)
// Digits immediately preceded by "$"
const priceRegex = /(?<=\$)\d+/g;
'$100 and $200'.match(priceRegex); // ["100", "200"]

// Negative lookbehind: (?<!...)
// Whole digit runs that are NOT preceded by "$". The lookbehind must also
// reject a preceding digit: with only (?<!\$) the engine skips the "1" of
// "$100" and then matches "00", because that "0" follows "1", not "$".
const nonPriceRegex = /(?<![$\d])\d+/g;
'$100 and 200'.match(nonPriceRegex); // ["200"]

命名组（Named Groups）

// Each (?<name>...) group is exposed under match.groups by its name
const dateRegex = /(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2})/;
const match = dateRegex.exec('2026-02-21');

const { year, month, day } = match.groups;
console.log(year);  // "2026"
console.log(month); // "02"
console.log(day);   // "21"

非贪婪（Non-Greedy）

const html = '<div>Hello</div><div>World</div>';

// Greedy (the default): .* takes as much as it can, so the match runs to the
// LAST closing tag
html.match(/<div>.*<\/div>/)[0];
// "<div>Hello</div><div>World</div>"

// Non-greedy: .*? takes as little as it can, so the match stops at the FIRST
// closing tag
html.match(/<div>.*?<\/div>/)[0];
// "<div>Hello</div>"

语言实现

JavaScript

// Two ways to create a regex. "pattern" and "flags" are placeholders here:
// substitute a real pattern and real flag letters (see the list below).
const regex1 = /pattern/flags;
const regex2 = new RegExp('pattern', 'flags');

// Methods (regex, str and replacement stand for your own values)
regex.test(str);        // boolean
str.match(regex);       // array, or null when nothing matches
str.replace(regex, replacement);
str.split(regex);
regex.exec(str);        // detailed match info

// Flags
// g: global search (find every match)
// i: ignore case
// m: multiline (^ and $ also match at line breaks)
// s: dotAll (. also matches newlines)
// u: Unicode

Python

import re

# Compile once and reuse ('pattern' is a placeholder)
pattern = re.compile(r'pattern', re.IGNORECASE)

# Functions (string and repl stand for your own values)
re.match(pattern, string)   # match only at the start of the string
re.search(pattern, string)  # first match anywhere in the string
re.findall(pattern, string) # list of all matches
re.sub(pattern, repl, string) # substitution

# Examples (text stands for your own input string)
emails = re.findall(r'[\w.+-]+@[\w.-]+\.\w+', text)
cleaned = re.sub(r'\s+', ' ', text).strip()

Go

import "regexp"

// Compile
re := regexp.MustCompile(`\d+`)

// Methods (s and r stand for your own strings)
re.MatchString(s)          // bool
re.FindString(s)           // first match
re.FindAllString(s, -1)    // all matches
re.ReplaceAllString(s, r)  // replacement

// Example
numbers := re.FindAllString("a1b2c3", -1)
// ["1", "2", "3"]

性能优化

复用编译后的正则

// Slower: the regex literal is evaluated on every call, which creates a new
// RegExp object each time the function runs.
function validateSlow(email) {
  return /^[\w.+-]+@[\w.-]+\.\w+$/.test(email);
}

// Faster: build the RegExp once at module level and reuse it. Sharing is safe
// here because the pattern has no g/y flag, so test() keeps no lastIndex state
// between calls.
const emailRegex = /^[\w.+-]+@[\w.-]+\.\w+$/;

/**
 * Check whether a string looks like an e-mail address.
 *
 * @param {string} email - Candidate address.
 * @returns {boolean} true when the whole string matches emailRegex.
 */
function validate(email) {
  return emailRegex.test(email);
}

使用非捕获组

// Unnecessary captures: the scheme and "www." are captured but never used
const capturingRegex = /(https?):\/\/(www\.)?(.+)/;

// Better: use a non-capturing group (?:...) where a group is only needed for
// the quantifier, and drop groups that serve no purpose. Only the part after
// the optional "www." is captured, so it is always group 1.
const regex = /https?:\/\/(?:www\.)?(.+)/;

避免灾难性回溯

// Dangerous: nested quantifier (a+)+
const bad = /^(a+)+$/;
// The trailing "!" makes the overall match fail, so the engine backtracks
// through the many ways of splitting the run of a's between the inner and
// outer "+" before giving up.
'aaaaaaaaaaaaaaaaaaaaaa!'.match(bad); // very slow

// Safe: restructure so there is a single quantifier
const good = /^a+$/;

调试技巧

使用Regex Tester

1. 输入模式
2. 输入测试字符串
3. 查看高亮匹配结果
4. 确认捕获组

分步验证

// Build a complex pattern from small pieces so each piece can be checked
// (and commented) on its own.
const parts = [
  '^',                    // start of string
  '[a-zA-Z0-9._%+-]+',    // local part
  '@',                    // literal @
  '[a-zA-Z0-9.-]+',       // domain
  '\\.',                  // literal dot
  '[a-zA-Z]{2,}',         // TLD
  '$',                    // end of string
];

// Glue the pieces back together, in order, into one pattern source.
let assembledSource = '';
for (const piece of parts) {
  assembledSource += piece;
}

const emailRegex = new RegExp(assembledSource);

常见问题

Q1: `.` 和 `\.` 有什么区别？

A：.匹配任意字符（通配符），\.匹配字面上的点。

Q2: `*` 和 `+` 有什么区别？

A：*匹配0个或更多，+匹配1个或更多。

ab*c → "ac", "abc", "abbc"
ab+c → "abc", "abbc"（"ac"不匹配）

Q3: 如何忽略大小写？

A：使用i标志。

JavaScript: /pattern/i
Python: re.IGNORECASE

Q4: 如何让`.`也匹配换行？

A：使用s标志（dotAll）。

JavaScript: /pattern/s
Python: re.DOTALL

Q5: 哪些字符需要转义？

A：\ ^ $ . | ? * + ( ) [ ] { } 这些字符在正则中有特殊含义，要匹配它们本身时需要用\转义（例如 \. 匹配字面上的点）。在JavaScript的 /.../ 字面量中，/ 也需要转义。

总结

正则表达式要点：

基本语法：., *, +, ?, [], ()
元字符：\d, \w, \s（预定义字符类），^, $（锚点）
量词：{n}, {n,}, {n,m}
高级功能：前瞻、后顾、命名组

熟能生巧。请在Regex Tester实时测试。

相关工具

工具	用途
Regex Tester	正则表达式测试和调试
JSON Formatter	JSON格式化