第10章：实战文本解析与替换 — 正则表达式

实战一：解析 Nginx 访问日志

日志分析是正则的核心应用场景。Nginx 的默认日志格式（Combined Log Format）有固定结构，非常适合正则解析：

import re
from collections import Counter, defaultdict
from datetime import datetime

# Nginx 默认日志行格式：
# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326

# One access-log line; every field is exposed as a named group.
# The trailing referer / user-agent fields are optional.
LOG_PATTERN = re.compile(r"""
    (?P<ip>\d+\.\d+\.\d+\.\d+)   # 客户端 IP
    \s+-\s+                         # identd（通常 -）
    (?P<user>\S+)                  # 认证用户（通常 -）
    \s+\[(?P<time>[^\]]+)\]\s+    # 时间戳
    "(?P<method>\w+)\s+            # HTTP 方法
    (?P<path>\S+)\s+               # 请求路径
    (?P<proto>HTTP/[0-9.]+)"\s+   # 协议版本
    (?P<status>\d{3})\s+           # 状态码
    (?P<size>\d+|-)                # 响应大小（- 表示0）
    (?:\s+"(?P<referer>[^"]*)")?  # Referer（可选）
    (?:\s+"(?P<ua>[^"]*)")?       # User-Agent（可选）
""", re.VERBOSE)


def analyze_log(log_file_path):
    """Summarise an access log file.

    Each line is stripped and matched against LOG_PATTERN; lines that do
    not match are skipped silently.

    Returns a dict with three keys:
      'status_dist' -- status code (as a string) mapped to its hit count
      'top10_ips'   -- up to ten (ip, count) pairs, most frequent first
      'error_urls'  -- the first 20 (status, path) pairs with a 4xx/5xx status
    """
    hits_by_status = Counter()
    hits_by_ip = Counter()
    failed_requests = []

    with open(log_file_path, encoding='utf-8', errors='ignore') as log:
        for raw_line in log:
            parsed = LOG_PATTERN.match(raw_line.strip())
            if parsed is None:
                continue

            status, ip, path = parsed.group('status', 'ip', 'path')
            hits_by_status[status] += 1
            hits_by_ip[ip] += 1

            # The status group is always exactly three digits, so [0] is safe.
            if status[0] in ('4', '5'):
                failed_requests.append((status, path))

    summary = {}
    summary['status_dist'] = dict(hits_by_status)
    summary['top10_ips'] = hits_by_ip.most_common(10)
    summary['error_urls'] = failed_requests[:20]
    return summary

实战二：提取并清理 HTML

从 HTML 中提取数据是正则的高频应用，但要注意正则只适合处理结构简单、已知格式的 HTML：

import re

def extract_article_data(html: str) -> dict:
    """Pull a few fields out of simple, well-formed HTML using regexes.

    Args:
        html: The HTML document as a string.

    Returns:
        A dict with:
          'title'          -- stripped text of the first <title>, or ''
          'description'    -- content of the first <meta name="description">
                              that has a non-empty content attribute, or ''
          'images'         -- list of (src, alt) tuples, one per <img> with a
                              non-empty src; alt is '' when absent
          'external_links' -- href values of <a> tags that start with 'http'
          'text_preview'   -- first 200 characters of the visible text

    Attributes are looked up one tag at a time, so their order inside the
    tag does not matter and a value may contain the other quote character
    (e.g. alt="Bob's cat").
    """
    title_m = re.search(r'<title>([^<]+)</title>', html, re.IGNORECASE)
    title = title_m.group(1).strip() if title_m else ''

    # Meta description: isolate each <meta ...> tag, then read its attributes.
    description = ''
    for tag in re.findall(r'<meta\b[^>]*', html, re.IGNORECASE):
        if (_attr_value(tag, 'name') or '').lower() != 'description':
            continue
        content = _attr_value(tag, 'content')
        if content:
            description = content
            break

    # Images: a single pattern with an optional alt group never captures alt,
    # because the greedy [^>]* in front of it swallows the attribute first.
    images = []
    for tag in re.findall(r'<img\b[^>]*', html, re.IGNORECASE):
        src = _attr_value(tag, 'src')
        if not src:
            continue
        images.append((src, _attr_value(tag, 'alt') or ''))

    links = re.findall(r'<a[^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE)
    external_links = [link for link in links if link.startswith('http')]

    # Visible text: drop script/style bodies, then all tags, then decode the
    # handful of entities this helper knows about.
    text = re.sub(r'<(?:script|style)[^>]*>.*?</(?:script|style)>',
                  '', html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<[^>]+>', '', text)
    entities = {
        '&amp;': '&', '&lt;': '<', '&gt;': '>',
        '&quot;': '"', '&nbsp;': ' ',
    }
    # The match is case-insensitive, so normalise before the dict lookup.
    text = re.sub(r'&(?:amp|lt|gt|quot|nbsp);',
                  lambda m: entities[m.group().lower()],
                  text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()

    return {'title': title, 'description': description,
            'images': images, 'external_links': external_links,
            'text_preview': text[:200]}


def _attr_value(tag: str, name: str):
    """Return the quoted value of attribute *name* in one tag, or None."""
    # The lookbehind stops 'src' from matching inside 'data-src'.
    m = re.search(
        r'(?<![\w-])' + re.escape(name) + r'\s*=\s*(?:"([^"]*)"|\'([^\']*)\')',
        tag, re.IGNORECASE,
    )
    if m is None:
        return None
    return m.group(1) if m.group(1) is not None else m.group(2)

实战三：处理 CSV 数据

# 标准库 csv 模块更健壮，生产环境应优先使用；但用正则手写一遍解析器，有助于理解 CSV 格式的原理

# CSV 字段解析（处理引号内含逗号、引号转义）
# One CSV field: either a double-quoted string (with "" as an escaped quote)
# or a run of non-comma characters. The second alternative can match the
# empty string, so this pattern must be applied at explicit positions rather
# than through findall(), which would report an extra '' after every field.
CSV_FIELD = re.compile(r'"(?:[^"]|"")*"|[^,]*')

def parse_csv_line(line: str) -> list:
    """Split one CSV line into its fields.

    Quoted fields may contain commas and use "" for a literal quote; the
    surrounding quotes are removed and "" is collapsed to ". Empty fields
    are kept, so 'a,,b' gives ['a', '', 'b'] and '' gives [''].

    Malformed input is handled leniently: an unterminated quote is kept
    verbatim, and text that follows a closing quote is appended to that
    field.

    Args:
        line: A single CSV record without its line terminator.

    Returns:
        The list of field strings.
    """
    result = []
    pos = 0
    end = len(line)
    while True:
        # Always succeeds: the unquoted alternative accepts an empty field.
        m = CSV_FIELD.match(line, pos)
        field = m.group()
        pos = m.end()

        # Only a properly closed quoted field has quotes at both ends.
        if len(field) >= 2 and field[0] == '"' and field[-1] == '"':
            field = field[1:-1].replace('""', '"')
        result.append(field)

        # Stray text after a closing quote: keep it with the current field.
        if pos < end and line[pos] != ',':
            next_comma = line.find(',', pos)
            if next_comma == -1:
                next_comma = end
            result[-1] += line[pos:next_comma]
            pos = next_comma

        if pos >= end:
            break
        pos += 1  # step over the separating comma
    return result

# Example
parse_csv_line('name,"Doe, John",30,"says ""hello"""')
# → ['name', 'Doe, John', '30', 'says "hello"']

# 验证 CSV 数字列
def clean_amount(s):
    """Extract the first number from text such as "$1,234.56" or "1234.56".

    Thousands separators (commas) inside the number are removed before
    conversion.

    Args:
        s: Text that may contain an amount.

    Returns:
        The amount as a float, or None when the text contains no digit.
    """
    # The match must start on a digit; otherwise a bare comma in the text
    # would match, leaving an empty string for float() to choke on.
    m = re.search(r'\d[\d,]*(?:\.\d+)?', s)
    if m is None:
        return None
    return float(m.group().replace(',', ''))

实战四：代码批量重构

正则是代码迁移和批量重构的有力工具：

# 案例1：Python 2 → Python 3 print 语句迁移
def fix_print_statements(code: str) -> str:
    """Rewrite Python 2 style ``print x`` statements as ``print(x)`` calls.

    Works line by line: leading whitespace is preserved and everything after
    ``print`` plus at least one whitespace character is wrapped in
    parentheses. Lines where ``print`` is followed directly by ``(`` are not
    matched and stay unchanged.
    """
    print_statement = re.compile(r'^(\s*)print\s+(.+)$', re.MULTILINE)
    return print_statement.sub(r'\1print(\2)', code)

# 案例2：驼峰命名 → 下划线命名
def camel_to_snake(name: str) -> str:
    """Convert a camelCase or PascalCase identifier to snake_case."""
    # Acronym boundary first: separate a run of capitals from a following
    # capitalised word (HTTPRequest -> HTTP_Request).
    acronyms_split = re.compile(r'([A-Z]+)([A-Z][a-z])').sub(r'\1_\2', name)
    # Then the ordinary boundary: lower-case letter or digit followed by a
    # capital (getUser -> get_User). Lower-casing happens last.
    words_split = re.compile(r'([a-z\d])([A-Z])').sub(r'\1_\2', acronyms_split)
    return words_split.lower()

camel_to_snake('getUserName')         # → 'get_user_name'
camel_to_snake('parseHTTPRequest')    # → 'parse_http_request'
camel_to_snake('loadHTMLPage')        # → 'load_html_page'

# 案例3：下划线命名 → 驼峰命名
def snake_to_camel(name: str) -> str:
    """Convert a snake_case identifier to camelCase.

    Every underscore that is immediately followed by a lower-case ASCII
    letter is removed and that letter is upper-cased; other underscores are
    left in place.
    """
    def capitalise(match):
        return match.group(1).upper()

    return re.compile(r'_([a-z])').sub(capitalise, name)

snake_to_camel('get_user_name')  # → 'getUserName'

# 案例4：批量替换 API 方法名
def migrate_api(code: str) -> str:
    """Apply a fixed sequence of regex rewrites to *code* and return the result.

    The rules run in the order listed; each one sees the output of the
    previous rule.
    """
    rewrite_rules = (
        # Rename: getUsers() becomes fetchUsers().
        (r'\bgetUsers\(\)', 'fetchUsers()'),
        # Namespace move: createUser(x) becomes users.create(x).
        (r'\bcreateUser\((\w+)\)', r'users.create(\1)'),
        # Drops the `function` keyword after `callback =`.
        (r'callback\s*=\s*function\(', 'callback = ('),
    )
    migrated = code
    for old_form, new_form in rewrite_rules:
        migrated = re.compile(old_form).sub(new_form, migrated)
    return migrated

实战五：配置文件解析

# 解析 .env 文件（KEY=VALUE 格式）
# One KEY=VALUE line of a .env file. Only horizontal whitespace ([ \t]) is
# allowed around the key, the '=' and the value: using \s here would let a
# match run across a newline, so an empty value would swallow the next line.
ENV_LINE = re.compile(r"""
    ^
    (?!\#)                     # reject lines that start with a comment marker
    [ \t]*
    (?P<key>[A-Z_][A-Z0-9_]*)  # key: upper-case letters, digits, underscores
    [ \t]*=[ \t]*
    (?P<value>
        "[^"]*"          |     # double-quoted value
        '[^']*'          |     # single-quoted value
        [^\s#]*                # bare value (may be empty), up to whitespace or a comment
    )
    [ \t]*                     # trailing whitespace is allowed
    (?:\#.*)?                  # optional end-of-line comment
    \r?                        # tolerate CRLF line endings
    $
""", re.VERBOSE | re.MULTILINE)

def parse_env_file(content: str) -> dict:
    """Parse .env-style text into a dict of string values.

    Recognised lines look like KEY=VALUE, where VALUE is double-quoted,
    single-quoted, or a bare token without whitespace. Matching outer quotes
    are removed. Comment lines, blank lines and lines that do not fit the
    format are ignored. ``KEY=`` yields an empty string. When a key appears
    more than once, the last occurrence wins.

    Args:
        content: The full text of the file.

    Returns:
        Mapping of key to value; every value is a str.
    """
    result = {}
    for m in ENV_LINE.finditer(content):
        value = m.group('value')
        # Strip one pair of matching outer quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            value = value[1:-1]
        result[m.group('key')] = value
    return result

# Example
env_content = """
# 数据库配置
DB_HOST=localhost
DB_PORT=5432
DB_NAME="my_database"
SECRET_KEY='abc123def456'
DEBUG=true  # 调试模式
"""
parse_env_file(env_content)
# → {'DB_HOST': 'localhost', 'DB_PORT': '5432',
#    'DB_NAME': 'my_database', 'SECRET_KEY': 'abc123def456', 'DEBUG': 'true'}

实战六：模板引擎

# 简单模板引擎：支持变量替换 {{ variable }} 和模板注释 {# ... #}
def render_template(template: str, context: dict) -> str:
    """Render a minimal template.

    Supported syntax:
      {{ name }}      -- replaced by str(context['name'])
      {{ user.name }} -- dotted path, resolved through nested dicts
                         (a literal 'user.name' key in context takes priority)
      {# ... #}       -- comment, removed (may span several lines)

    A placeholder whose variable is missing or None is left untouched.
    Runs of three or more newlines are collapsed to a single blank line and
    the result is stripped.

    Args:
        template: The template text.
        context: Variables available to the template.

    Returns:
        The rendered text.
    """
    def resolve(path):
        value = context.get(path)
        if value is None and '.' in path:
            # Walk nested dicts only; arbitrary attribute access is
            # deliberately not supported.
            node = context
            for part in path.split('.'):
                if not isinstance(node, dict) or part not in node:
                    return None
                node = node[part]
            value = node
        return value

    def replace_var(m):
        value = resolve(m.group(1))
        if value is None:
            return m.group(0)  # unknown variable: keep the placeholder
        return str(value)

    # Remove comments before substituting, so that text coming from the
    # context is never mistaken for a template comment and deleted.
    result = re.sub(r'\{#.*?#\}', '', template, flags=re.DOTALL)

    result = re.sub(r'\{\{\s*(\w+(?:\.\w+)*)\s*\}\}', replace_var, result)

    # Collapse the blank lines left behind by removed comments.
    result = re.sub(r'\n{3,}', '\n\n', result)

    return result.strip()

# Example
template = """
Dear {{ name }},
{# 这是注释，不会出现在输出中 #}
You have {{ count }} unread messages.
Visit {{ base_url }}/inbox to read them.
"""

render_template(template, {
    'name': 'Alice',
    'count': 5,
    'base_url': 'https://example.com'
})

性能优化总结

预编译正则（re.compile）

在循环外预编译，循环内复用 Pattern 对象。虽然 re 模块有内部缓存，但显式编译更高效，也让代码意图更清晰。规则：任何在函数外定义、会被多次调用的正则都应预编译。

否定字符类代替懒惰量词

在分隔符已知时，用 [^X]* 代替 .*?。前者不需要回溯（一次扫描到分隔符），后者需要逐步扩展。对于长字符串，性能差异可达 2-10 倍。

避免嵌套量词

(a+)+、(\w+\s*)+ 等嵌套量词在某些输入下触发指数级回溯。改写思路：消除模糊边界，明确每个字符只被一个量词处理。

使用 fullmatch 代替 ^ $

验证格式时，re.fullmatch(r'\d+', s) 比 re.match(r'^\d+$', s) 稍快且更清晰，因为不需要在模式中显式写锚点。两者还有一个细微差别：$ 允许字符串末尾多出一个换行符，所以 '123\n' 能通过 ^\d+$，却无法通过 fullmatch——做严格校验时 fullmatch 更安全。

# 性能基准示例
import re, timeit

# 场景：从1000行文本中提取引号内的字符串
text = 'a "hello world" b\n' * 1000

# Variant 1: lazy quantifier
p1 = re.compile(r'".*?"')

# Variant 2: negated character class
p2 = re.compile(r'"[^"]*"')

# Time 1000 runs of each pattern over the same text and report both totals.
t1 = timeit.timeit(lambda: p1.findall(text), number=1000)
t2 = timeit.timeit(lambda: p2.findall(text), number=1000)
# The string prefix was "ff", which is a syntax error; a single "f" is intended.
print(f"懒惰量词: {t1:.3f}s, 否定类: {t2:.3f}s")

调试工具推荐

工具	用途	特点
regex101.com	在线调试首选	实时高亮、解释说明、支持 Python/JS/PHP/Go 方言
regexr.com	可视化解释	悬停查看每个元素的含义，教学友好
pythex.org	Python 专用	直接测试 Python re 模块的行为
re.DEBUG 标志	打印内部结构	显示正则的内部解析树，帮助理解引擎行为
safe-regex	检测 ReDoS	npm 包，静态分析正则是否存在灾难性回溯风险

# re.DEBUG 示例
import re
re.compile(r'(\d+)-(\w+)', re.DEBUG)
# Compiling with re.DEBUG prints the parsed pattern tree.
# Approximate output (simplified; the exact text varies between Python
# versions, so verify on the interpreter you use):
# SUBPATTERN 1 0 0                <- capture group 1
#   MAX_REPEAT 1 MAXREPEAT        <- the + quantifier
#     IN                          <- character class
#       CATEGORY CATEGORY_DIGIT   <- \d
# LITERAL 45                      <- the literal '-' (code point 45)
# SUBPATTERN 2 0 0                <- capture group 2
#   MAX_REPEAT 1 MAXREPEAT
#     IN
#       CATEGORY CATEGORY_WORD    <- \w
# NOTE(review): CPython reports \d and \w as CATEGORY entries rather than
# expanding them into RANGE lists such as (48, 57) or (65, 90); RANGE lines
# only appear for explicit classes like [0-9A-Za-z_]. Confirm locally.

课程总结

恭喜完成正则表达式完全指南！你现在系统掌握了：

语法基础（第1-6章）

字面字符、元字符、转义规则
量词 * + ? {n,m}
字符类 [...] 和预定义类 \d \w \s
捕获组、命名组、非捕获组、反向引用
零宽断言：四种前瞻/后顾
贪婪/懒惰/占有性量词与回溯机制
标志位：i、m、s、x 及其精确含义

工程实践（第7-10章）

Python re 模块完整 API
JavaScript RegExp 及 ES2018+ 新特性
常用模式库（邮箱、URL、手机号等）
ReDoS 防御与性能优化
日志分析、HTML 提取、CSV 解析
代码批量重构（驼峰/下划线转换）
配置文件解析、模板引擎实现

下一步建议：正则表达式最好的学习方法是"用中学"：① 打开 regex101.com，把你工作中遇到的文本处理问题用正则解决；② 阅读开源项目中的正则表达式（如知名库的 URL 解析、日志处理代码），理解他人的设计思路；③ 记住：正则不是银弹——遇到复杂结构（HTML/JSON/XML）优先用专门的解析器，正则负责处理"线性文本模式"。

上一章：常用正则模式库 ｜ 返回目录：正则表达式完全指南

实战：文本解析与替换

实战一：解析 Nginx 访问日志

实战二：提取并清理 HTML

实战三：处理 CSV 数据

实战四：代码批量重构

实战五：配置文件解析

实战六：模板引擎

性能优化总结

调试工具推荐

课程总结

语法基础（第1-6章）

工程实践（第7-10章）