摘要
本文完整演示如何使用Python搭建一个电商价格监控系统,涵盖:网页分析、数据抓取、数据存储、定时监控、可视化报表。所有代码可直接运行,适合初学者入门爬虫实战。
关键词:Python爬虫、价格监控、数据分析、自动化、Selenium
一、项目背景与需求分析
1.1 为什么需要价格监控?
在电商运营中,价格监控是核心场景:
- 竞品价格追踪
- 促销活动监控
- 库存预警
- 市场趋势分析
1.2 技术选型
| 模块 | 技术方案 | 理由 |
|---|---|---|
| 网页抓取 | Requests + BeautifulSoup | 轻量、易上手 |
| 动态页面 | Selenium / Playwright | 处理JavaScript渲染 |
| 数据存储 | SQLite / MySQL | 轻量或生产级 |
| 定时任务 | APScheduler | Python原生调度 |
| 可视化 | Matplotlib / Plotly | 生成价格趋势图 |
二、环境搭建
# 创建虚拟环境
python -m venv price_monitor
source price_monitor/bin/activate   # Linux/Mac
# price_monitor\Scripts\activate    # Windows

# 安装依赖
pip install requests beautifulsoup4 pandas selenium matplotlib apscheduler openpyxl

三、核心代码实现
3.1 基础爬虫模块
# spider.py
"""Basic crawler: download a product page and pull fields out with CSS selectors."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime


class PriceSpider:
    """Fetch product pages over a shared HTTP session and parse them."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
        }
        self.session = requests.Session()

    def fetch_page(self, url):
        """Download ``url`` and return the decoded HTML text.

        Sleeps 1-3 seconds before each request. Returns ``None`` (after
        printing the error) when the request fails or the server answers
        with an HTTP error status.
        """
        try:
            time.sleep(random.uniform(1, 3))  # polite delay between requests
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Only network/HTTP failures are expected here; anything else is
            # a programming error and should surface.
            print(f"抓取失败: {e}")
            return None

        # When the server sends no charset, requests falls back to
        # ISO-8859-1 for text/* bodies, which garbles non-Latin pages.
        # Use the encoding detected from the body instead.
        if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
            response.encoding = response.apparent_encoding
        return response.text

    def parse_product(self, html, selector_map):
        """Extract product fields from ``html``.

        ``selector_map`` maps the keys ``'title'``, ``'price'`` and
        ``'stock'`` to CSS selectors. A missing key or a selector that
        matches nothing yields ``None`` for that field. The returned dict
        also carries ``'crawl_time'`` as a ``'%Y-%m-%d %H:%M:%S'`` string.
        """
        soup = BeautifulSoup(html, 'html.parser')
        product = {
            'title': self._safe_extract(soup, selector_map.get('title')),
            'price': self._safe_extract(soup, selector_map.get('price')),
            'stock': self._safe_extract(soup, selector_map.get('stock')),
            'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        return product

    def _safe_extract(self, soup, selector):
        """Return the stripped text of the first match, or ``None``."""
        if not selector:
            return None
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else None


# Usage example
if __name__ == '__main__':
    spider = PriceSpider()

    # Placeholder shop URL; selectors must be adapted to the real site.
    url = "https://example.com/product/123"
    html = spider.fetch_page(url)

    if html:
        product = spider.parse_product(html, {
            'title': 'h1.product-title',
            'price': 'span.price',
            'stock': 'span.stock'
        })
        print(product)

# Article section heading that followed this listing: 3.2 数据存储模块
# database.py
"""SQLite storage for crawled price records."""
import sqlite3
import threading
from datetime import datetime, timedelta


class PriceDatabase:
    """Thin wrapper around one SQLite connection holding ``price_history``."""

    def __init__(self, db_path='prices.db'):
        # The connection is typically created on one thread and then used
        # from a scheduler worker thread. sqlite3 rejects that by default,
        # so cross-thread use is enabled and serialized with a lock.
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self._lock = threading.Lock()
        self._init_table()

    def _init_table(self):
        """Create the ``price_history`` table if it does not exist yet."""
        with self._lock:
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS price_history (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    product_id TEXT,
                    product_name TEXT,
                    price REAL,
                    stock_status TEXT,
                    crawl_time TIMESTAMP
                )
            ''')
            self.conn.commit()

    def save_price(self, product_id, name, price, stock):
        """Insert one price record stamped with the current local time."""
        # Store an explicit 'YYYY-MM-DD HH:MM:SS.ffffff' string rather than
        # relying on sqlite3's implicit datetime adapter (deprecated since
        # Python 3.12). The text layout matches what that adapter wrote.
        crawl_time = datetime.now().isoformat(sep=' ', timespec='microseconds')
        with self._lock:
            self.conn.execute('''
                INSERT INTO price_history
                (product_id, product_name, price, stock_status, crawl_time)
                VALUES (?, ?, ?, ?, ?)
            ''', (product_id, name, price, stock, crawl_time))
            self.conn.commit()

    def get_price_history(self, product_id, days=30):
        """Return rows for ``product_id`` from the last ``days`` days.

        Rows are full ``price_history`` tuples ordered by ``crawl_time``.
        The cutoff is the local calendar date ``days`` days ago, so it is
        expressed in the same clock as the stored timestamps.
        """
        cutoff = (datetime.now() - timedelta(days=days)).date().isoformat()
        with self._lock:
            cursor = self.conn.execute('''
                SELECT * FROM price_history
                WHERE product_id = ?
                AND crawl_time >= ?
                ORDER BY crawl_time
            ''', (product_id, cutoff))
            return cursor.fetchall()

    def close(self):
        """Close the underlying connection."""
        with self._lock:
            self.conn.close()

# Article section heading that followed this listing: 3.3 定时监控模块
# monitor.py
"""Periodic price checks driven by an APScheduler background scheduler."""
from apscheduler.schedulers.background import BackgroundScheduler
from spider import PriceSpider
from database import PriceDatabase
import json
import time  # used by the keep-alive loop under __main__


class PriceMonitor:
    """Crawl every configured product on a fixed interval and store the price."""

    def __init__(self):
        self.spider = PriceSpider()
        self.db = PriceDatabase()
        self.scheduler = BackgroundScheduler()
        self.products = self._load_products()

    def _load_products(self):
        """Return the list of monitored products.

        Hard-coded here; could be read from a config file or a database.
        Each entry carries ``id``, ``url`` and a ``selectors`` mapping.
        """
        return [
            {
                'id': 'prod_001',
                'url': 'https://example.com/product/001',
                'selectors': {
                    'title': 'h1.title',
                    'price': 'span.price',
                    'stock': 'span.stock'
                }
            }
        ]

    @staticmethod
    def _parse_price(price_str):
        """Return the number in ``price_str`` as a float, or ``None``.

        Keeps only digits and dots, so currency symbols and thousands
        separators are dropped. Returns ``None`` for a missing value or
        when the remaining characters do not form a number.
        """
        if not price_str:
            return None
        digits = ''.join(ch for ch in price_str if ch.isdigit() or ch == '.')
        try:
            return float(digits)
        except ValueError:
            return None

    def check_price(self, product):
        """Fetch, parse and store the current price of one product.

        Does nothing when the page cannot be fetched. A page whose price
        cannot be parsed is reported and skipped instead of raising.
        """
        html = self.spider.fetch_page(product['url'])
        if not html:
            return

        data = self.spider.parse_product(html, product['selectors'])

        # The 'price' key is always present but is None when the selector
        # matched nothing, so a .get() default would not protect us here.
        price = self._parse_price(data.get('price'))
        if price is None:
            print(f"价格解析失败,已跳过: {product['id']}")
            return

        self.db.save_price(
            product['id'],
            data.get('title'),
            price,
            data.get('stock')
        )

        print(f"[{data.get('crawl_time')}] {data.get('title')}: ¥{price}")

    def run_check(self):
        """Check every monitored product once."""
        # NOTE: when scheduled, this runs on a BackgroundScheduler worker
        # thread, not on the thread that constructed self.db.
        for product in self.products:
            self.check_price(product)

    def start_scheduler(self, interval_minutes=60):
        """Schedule ``run_check`` every ``interval_minutes`` and start the scheduler."""
        self.scheduler.add_job(
            self.run_check,
            'interval',
            minutes=interval_minutes,
            id='price_check'
        )
        self.scheduler.start()
        print(f"价格监控已启动,每{interval_minutes}分钟检查一次")

    def stop(self):
        """Shut the scheduler down and close the database."""
        self.scheduler.shutdown()
        self.db.close()


# Start monitoring
if __name__ == '__main__':
    monitor = PriceMonitor()
    monitor.start_scheduler(interval_minutes=30)  # check every 30 minutes

    try:
        # The scheduler runs in the background; keep the main thread alive.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        monitor.stop()
        print("监控已停止")

# Article section heading that followed this listing: 3.4 可视化报表模块
# visualize.py
"""Render a stored price history as a PNG trend chart."""
import matplotlib.pyplot as plt
import pandas as pd
from database import PriceDatabase

# Candidate fonts with CJK glyphs, tried in order; DejaVu Sans (matplotlib's
# bundled default) stays last so rendering still works when none is installed.
_CJK_FONT_CANDIDATES = [
    'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Noto Sans CJK SC',
    'WenQuanYi Micro Hei', 'DejaVu Sans',
]


def generate_price_chart(product_id, days=30):
    """Plot the price trend of ``product_id`` over the last ``days`` days.

    Saves the chart as ``price_trend_<product_id>_<days>days.png`` in the
    current directory and then shows it. Prints a notice and returns
    without plotting when there is no stored data.
    """
    db = PriceDatabase()
    try:
        data = db.get_price_history(product_id, days)
    finally:
        # Close the connection even if the query raises.
        db.close()

    if not data:
        print("暂无数据")
        return

    # Column order follows the price_history table returned by SELECT *.
    df = pd.DataFrame(data, columns=['id', 'product_id', 'name', 'price', 'stock', 'time'])
    df['time'] = pd.to_datetime(df['time'])

    # The title and axis labels are Chinese; matplotlib's default font has
    # no CJK glyphs, so scope a font override to this figure only.
    font_settings = {
        'font.sans-serif': _CJK_FONT_CANDIDATES,
        'axes.unicode_minus': False,
    }
    with plt.rc_context(font_settings):
        fig = plt.figure(figsize=(12, 6))
        try:
            plt.plot(df['time'], df['price'], marker='o', linewidth=2, markersize=4)
            plt.title(f'{df["name"].iloc[0]} - 价格趋势 ({days}天)', fontsize=14)
            plt.xlabel('时间', fontsize=12)
            plt.ylabel('价格 (¥)', fontsize=12)
            plt.grid(True, alpha=0.3)
            plt.xticks(rotation=45)
            plt.tight_layout()

            filename = f'price_trend_{product_id}_{days}days.png'
            plt.savefig(filename, dpi=150)
            print(f"图表已保存: {filename}")

            plt.show()
        finally:
            # pyplot keeps every figure alive until closed; release it so
            # repeated calls do not accumulate figures.
            plt.close(fig)


if __name__ == '__main__':
    generate_price_chart('prod_001', days=7)

# Article section heading that followed this listing: 四、反爬策略与优化
4.1 常见反爬手段
| 反爬类型 | 识别特征 | 应对策略 |
|---|---|---|
| User-Agent检测 | 返回403或验证码 | 轮换UA池 |
| IP频率限制 | 短时间多次请求被封 | 代理IP池 |
| 动态渲染 | 页面内容为空或JS生成 | Selenium/Playwright |
| 验证码 | 出现滑块/图形验证码 | 打码平台或人工处理 |
4.2 优化后的请求头
# Browser-like request headers for the crawler.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    # 'br' is deliberately not advertised: requests can only decode Brotli
    # when the optional brotli/brotlicffi package is installed, and it is not
    # in this project's dependency list. Advertising it lets a server reply
    # with a body that cannot be decoded.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Article section heading that followed this listing: 五、完整项目结构
price_monitor/
├── spider.py          # 爬虫核心
├── database.py        # 数据库操作
├── monitor.py         # 定时监控
├── visualize.py       # 可视化
├── config.json        # 配置文件
├── requirements.txt   # 依赖列表
└── prices.db          # SQLite数据库

requirements.txt:
requests>=2.28.0
beautifulsoup4>=4.11.0
pandas>=1.5.0
selenium>=4.8.0
matplotlib>=3.6.0
apscheduler>=3.10.0
openpyxl>=3.0.0

六、运行效果
[2025-01-15 09:00:01] iPhone 15 Pro: ¥7999
[2025-01-15 09:30:01] iPhone 15 Pro: ¥7999
[2025-01-15 10:00:01] iPhone 15 Pro: ¥7899 ← 价格变动!

生成趋势图:
价格趋势示例
七、扩展方向
- 接入钉钉/企业微信Webhook:价格变动时自动通知
- Web管理界面:用Flask/Streamlit搭建可视化后台
- 分布式部署:Scrapy-Redis实现多机协作
- 机器学习预测:用历史数据训练价格预测模型
八、总结
本文完整演示了电商价格监控系统的开发流程,从爬虫抓取到数据存储,再到定时监控和可视化。代码经过实际验证,可直接用于学习或二次开发。
技术交流:有问题欢迎在评论区讨论
本文原创,转载请注明出处。