第7章：Python re 模块 — 正则表达式

模块函数全览

re 模块是 Python 标准库中的正则表达式引擎，基于 NFA 实现，支持 Perl 风格的正则语法。

import re

# 主要函数一览
re.match()      # 从字符串开头匹配（不是全字符串）
re.search()     # 在整个字符串中搜索第一个匹配
re.findall()    # 找到所有匹配，返回字符串列表
re.finditer()   # 找到所有匹配，返回 Match 对象迭代器
re.sub()        # 替换所有（或指定次数的）匹配
re.subn()       # 替换并返回 (新字符串, 替换次数) 元组
re.split()      # 按正则分割字符串
re.compile()    # 预编译正则，返回 Pattern 对象
re.fullmatch()  # 要求整个字符串完全匹配（Python 3.4+）
re.escape()     # 转义字符串中的所有特殊字符
re.purge()      # 清空编译缓存（一般不需要手动调用）

三个核心函数的区别

match、search、fullmatch 是最常混淆的三个函数：

re.match(pattern, string)

只从字符串开头开始尝试匹配。如果开头不匹配，即使后面有匹配也返回 None。效果相当于在模式前加了 \A 锚点（即使使用 re.MULTILINE，也只在整个字符串的开头匹配，而不是每一行的行首），但不加 $——因此 re.match(r'\d+', '123abc') 会匹配 '123'（不要求到末尾）。

re.search(pattern, string)

在整个字符串中搜索第一个匹配位置。如果任意位置有匹配，返回第一个 Match 对象。是最通用的搜索函数，等价于 Perl 的默认匹配行为。

re.fullmatch(pattern, string)

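要求整个字符串完全匹配模式。效果近似于把模式写成 \A(?:…)\Z，而不是简单地在两端加 ^ 和 $：$ 还会匹配字符串末尾换行符之前的位置（例如 re.match(r'\d+$', '123\n') 能匹配，而 re.fullmatch(r'\d+', '123\n') 返回 None），并且含 | 的模式也无需手动加括号。用于验证格式时最合适（如验证邮箱、日期格式是否完全合法）。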

import re

s = 'hello world 123'

# match：只从开头
re.match(r'\d+', s)          # None（'hello' 不是数字）
re.match(r'hello', s)        # 匹配 'hello'
re.match(r'hello', s).span() # (0, 5)

# search：搜索整个字符串
re.search(r'\d+', s)          # 匹配 '123'（在字符串末尾）
re.search(r'\d+', s).group()  # '123'

# fullmatch：整个字符串必须完全匹配
re.fullmatch(r'hello.*', s)   # 匹配（.* 可以匹配剩余部分）
re.fullmatch(r'\d+', s)       # None（整串不只是数字）
re.fullmatch(r'\d+', '12345') # 匹配（整串都是数字）

# 实用建议
# 验证格式用 fullmatch，搜索内容用 search，几乎不用 match

Match 对象的完整 API

m = re.search(r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',
              '今天是 2026-03-26 星期四')

# 获取匹配内容
m.group()        # '2026-03-26'  整个匹配（等价于 m.group(0)）
m.group(1)       # '2026'        第1个捕获组
m.group(2)       # '03'
m.group(3)       # '26'
m.group('year') # '2026'        通过名字访问
m.groups()       # ('2026', '03', '26')  所有捕获组元组
m.groupdict()    # {'year': '2026', 'month': '03', 'day': '26'}

# 获取位置信息
m.start()        # 4   整个匹配的开始位置（字符串索引）
m.end()          # 14  整个匹配的结束位置（不包含）
m.span()         # (4, 14)  (start, end) 元组
m.start(1)       # 4   第1组的开始位置
m.span('month') # (9, 11) 'month' 组的位置

# 原始信息
m.string         # '今天是 2026-03-26 星期四'  原始字符串
m.re             # 编译后的 Pattern 对象
m.lastindex      # 3  最后匹配的捕获组编号
m.lastgroup      # 'day'  最后匹配的捕获组名字

findall 的行为规则

findall 是最常用的批量提取函数，但其返回值会随捕获组的数量变化：

# 规则1：无捕获组 → 返回整个匹配的字符串列表
re.findall(r'\d+', 'a1 b22 c333')
# → ['1', '22', '333']

# 规则2：一个捕获组 → 返回该组内容的列表
re.findall(r'(\d+)', 'a1 b22 c333')
# → ['1', '22', '333']  （与无捕获组相同，因为组=整体）

re.findall(r'\w+=(\d+)', 'x=1 y=22 z=333')
# → ['1', '22', '333']  （只返回 = 后的数字，不含 x=）

# 规则3：多个捕获组 → 返回元组列表
re.findall(r'(\w+)=(\d+)', 'x=1 y=22 z=333')
# → [('x', '1'), ('y', '22'), ('z', '333')]

# 常见陷阱：用捕获组时返回值改变
re.findall(r'(<.+?>)', '<b>text</b>')
# → ['<b>', '</b>']  （有捕获组，返回组内容——恰好等于整体）

re.findall(r'<(.+?)>', '<b>text</b>')
# → ['b', '/b']  （捕获组不含尖括号，只有标签名）

finditer：内存高效的迭代

# findall 一次性返回所有结果（大文件时内存占用大）
# finditer 返回迭代器，按需生成 Match 对象（内存友好）

for m in re.finditer(r'\d+', 'a1 b22 c333'):
    print(f'{m.group()} at position {m.start()}')
# 1 at position 1
# 22 at position 4
# 333 at position 8

# Scan a large log file without holding the whole file in memory: iterate it
# line by line and keep only the matched text. Because "." does not match a
# newline, "ERROR.*" never spans lines, so per-line matching finds exactly the
# same matches as scanning the entire content at once.
with open('access.log') as f:
    errors = [m.group() for line in f for m in re.finditer(r'ERROR.*', line)]

sub：强大的替换功能

# 基本替换
re.sub(r'\s+', ' ', 'hello   world')
# → 'hello world'  （多个空白压缩为一个）

# 用捕获组反向引用重组
re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\2/\3/\1', 'Date: 2026-03-26')
# → 'Date: 03/26/2026'

# 限制替换次数
re.sub(r'\s', '-', 'a b c d', count=2)
# → 'a-b-c d'  （只替换前2个）

# 用函数作为替换器（动态替换）
def double_number(m):
    """Return the matched integer text doubled, as a string.

    Meant to be passed as the replacement callback to ``re.sub``: ``m`` is
    the match object, and its full match is parsed with ``int``.
    """
    value = int(m.group(0))
    doubled = 2 * value
    return str(doubled)

re.sub(r'\d+', double_number, 'a=5 b=10 c=15')
# → 'a=10 b=20 c=30'

# 函数替换：格式化货币
def format_currency(m):
    """Format the matched number as a yuan amount.

    Meant to be passed as the replacement callback to ``re.sub``: the full
    match of ``m`` is parsed with ``float`` and rendered with thousands
    separators and two decimal places, prefixed by the currency sign.
    """
    value = float(m.group(0))
    digits = format(value, ',.2f')
    return '¥' + digits

re.sub(r'\d+(?:\.\d+)?', format_currency, 'total: 1234567.5')
# → 'total: ¥1,234,567.50'

# subn 返回替换次数
result, count = re.subn(r'\s+', ' ', 'a  b   c')
# result='a b c', count=2

split：灵活的分割

# 按空白分割（处理多余空格）
re.split(r'\s+', '  hello   world  ')
# → ['', 'hello', 'world', '']  （首尾空字符串）

# 过滤空字符串
[x for x in re.split(r'\s+', '  hello   world  ') if x]
# → ['hello', 'world']

# 按多种分隔符
re.split(r'[,;|]', 'a,b;c|d')
# → ['a', 'b', 'c', 'd']

# 带捕获组：分隔符也出现在结果中
re.split(r'(\s+)', 'hello world')
# → ['hello', ' ', 'world']  （空格作为分隔符也保留）

# 限制分割次数
re.split(r',', 'a,b,c,d', maxsplit=2)
# → ['a', 'b', 'c,d']  （最多分成3份）

compile 与性能优化

re.compile(pattern, flags=0)

预编译正则表达式，返回 Pattern 对象。Pattern 对象拥有与 re 模块相同的方法（match、search、findall 等），但省去了每次调用时重新解析模式的开销。re 模块内部有缓存（默认512个），多次用同一模式字符串调用 re.search() 会自动缓存，但显式 compile() 更明确。

# 场景：在循环中重复使用同一正则

# ❌ 较慢：每次调用都要经过模块级函数并查找编译缓存（不会重新解析模式，但查找本身有开销）
for line in large_log_file:
    re.search(r'\d{4}-\d{2}-\d{2}', line)

# ✅ 高效：预编译，循环内直接使用
DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2}')
for line in large_log_file:
    DATE_RE.search(line)

# Pattern 对象提供与 re 模块同名的方法（无需再传 pattern 参数；search/match/findall 等还可额外接收 pos、endpos）
pattern = re.compile(r'(?P<key>\w+)=(?P<value>\S+)')

pattern.findall('x=1 y=2 z=3')
# → [('x', '1'), ('y', '2'), ('z', '3')]

pattern.sub(r'\g<key>: \g<value>', 'x=1 y=2')
# → 'x: 1 y: 2'

re.escape：处理用户输入

# 当用户输入包含特殊字符，需要当作字面字符串匹配时
user_input = '2.5 * (3+4) = 17.5'

# 直接使用用户输入作为正则是危险的（可能包含元字符）
# re.search(user_input, text)  ← 错误！

# 正确：转义所有特殊字符
escaped = re.escape(user_input)
# → '2\\.5\\ \\*\\ \\(3\\+4\\)\\ =\\ 17\\.5'  （Python 3.7+ 只转义正则特殊字符，= 不再被转义）

# 常见场景：高亮搜索关键词
def highlight_keyword(text, keyword):
    """Wrap every case-insensitive occurrence of ``keyword`` in ``text`` with [].

    The keyword is matched literally (via ``re.escape``), so regex
    metacharacters such as ``+`` or ``.`` carry no special meaning.

    Args:
        text: The string to search.
        keyword: The literal text to highlight.

    Returns:
        ``text`` with each occurrence bracketed. The matched text keeps the
        casing it has in ``text``. An empty ``keyword`` returns ``text``
        unchanged.
    """
    if not keyword:
        # An empty pattern matches at every position and would scatter "[]"
        # through the whole string.
        return text
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    # A callable replacement is inserted verbatim. A string template would
    # interpret backslashes in the keyword (e.g. "\n", "\1") and would also
    # overwrite the original casing of each match with the keyword's casing.
    return pattern.sub(lambda m: f'[{m.group()}]', text)

highlight_keyword('I love C++ and C#', 'C++')
# → 'I love [C++] and C#'  （+ 被正确转义，不被解析为量词）

常见陷阱总结

陷阱1：match vs search

# match 只从开头
re.match(r'\d+', 'abc123')
# → None！

re.search(r'\d+', 'abc123')
# → '123' ✓

陷阱2：findall + 捕获组

# 有捕获组时返回值变了
re.findall(r'(\d+)', 'a1b2')
# → ['1', '2']（组内容）

# 用非捕获组保留全匹配
re.findall(r'(?:\d+)', 'a1b2')
# → ['1', '2'] ✓

陷阱3：忘记原始字符串

# \n 是换行，不是反斜杠n
re.search("\n", text)  # 匹配换行

# r"\n" 是两个字符
re.search(r"\n", text) # 也匹配换行（re 解析）

# 差别在复杂模式中体现
# 始终用 r"..." ！

陷阱4：贪婪过度匹配

# 贪婪导致匹配过多
re.findall(r'".*"', '"a" "b"')
# → ['"a" "b"']（整体）

# 懒惰或否定类解决
re.findall(r'"[^"]*"', '"a" "b"')
# → ['"a"', '"b"'] ✓

小结

本章要点

search 搜索整串；match 只从开头；fullmatch 要求整串完全匹配——验证格式用 fullmatch
Match 对象：group(0/1/2/name) 获取内容，groups() 获取所有组元组，groupdict() 获取命名组字典，span() 获取位置
findall：无组→整体，一个组→组内容，多个组→元组列表；finditer 大文件时更省内存
sub 支持字符串（\1 反向引用）、函数（动态替换）两种替换方式
compile() 预编译：循环中重复使用时省去每次调用的缓存查找开销（re 自带缓存，性能提升通常有限）；代码意图更清晰
re.escape() 转义用户输入，防止元字符被误解析——处理用户输入的必备工具

上一章：标志位与多行模式 | 下一章：JavaScript RegExp 完全指南

Python re 模块完全指南