Python爬虫实战：从0到1搭建电商价格监控系统（附完整源码）-港品优选

摘要

本文完整演示如何使用Python搭建一个电商价格监控系统，涵盖：网页分析、数据抓取、数据存储、定时监控、可视化报表。所有代码可直接运行，适合初学者入门爬虫实战。

关键词：Python爬虫、价格监控、数据分析、自动化、Selenium

一、项目背景与需求分析

1.1 为什么需要价格监控？

在电商运营中，价格监控的核心应用场景包括：

竞品价格追踪
促销活动监控
库存预警
市场趋势分析

1.2 技术选型

模块	技术方案	理由
网页抓取	Requests + BeautifulSoup	轻量、易上手
动态页面	Selenium / Playwright	处理JavaScript渲染
数据存储	SQLite / MySQL	轻量或生产级
定时任务	APScheduler	纯Python实现的第三方任务调度库
可视化	Matplotlib / Plotly	生成价格趋势图

二、环境搭建

# 创建虚拟环境 python -m venv price_monitor source price_monitor/bin/activate # Linux/Mac # price_monitor\Scripts\activate # Windows # 安装依赖 pip install requests beautifulsoup4 pandas selenium matplotlib apscheduler openpyxl

三、核心代码实现

3.1 基础爬虫模块

# spider.py
"""Basic scraper: fetch a product page and extract fields by CSS selector."""
import random
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup


class PriceSpider:
    """Fetch product pages and extract title, price and stock text."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.0'
        }
        self.session = requests.Session()

    def fetch_page(self, url):
        """Download ``url`` and return the decoded HTML.

        Sleeps 1-3 seconds before each request as a politeness delay.

        Args:
            url: Address of the page to download.

        Returns:
            The response body as text, or ``None`` when the request fails
            (connection error, timeout, or a non-2xx status).
        """
        time.sleep(random.uniform(1, 3))  # polite delay between requests
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Only network/HTTP failures are treated as "fetch failed";
            # programming errors are allowed to surface.
            print(f"抓取失败: {e}")
            return None

        # When a text response carries no charset, requests assumes
        # ISO-8859-1, which garbles UTF-8/GBK pages. Fall back to the
        # encoding detected from the body in that case.
        if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
            response.encoding = response.apparent_encoding
        return response.text

    def parse_product(self, html, selector_map):
        """Extract product fields from ``html``.

        Args:
            html: Page markup as a string.
            selector_map: Mapping with optional ``'title'``, ``'price'`` and
                ``'stock'`` keys whose values are CSS selectors.

        Returns:
            A dict with ``title``, ``price`` and ``stock`` (each the stripped
            text of the first matching element, or ``None``) plus
            ``crawl_time`` formatted as ``YYYY-MM-DD HH:MM:SS``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        product = {
            'title': self._safe_extract(soup, selector_map.get('title')),
            'price': self._safe_extract(soup, selector_map.get('price')),
            'stock': self._safe_extract(soup, selector_map.get('stock')),
            'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
        return product

    def _safe_extract(self, soup, selector):
        """Return the stripped text of the first match, or ``None``.

        ``None`` is returned both when ``selector`` is empty/missing and when
        nothing in ``soup`` matches it.
        """
        if not selector:
            return None
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else None


# Usage example
if __name__ == '__main__':
    spider = PriceSpider()

    # Example shop page; adjust the selectors to the real site.
    url = "https://example.com/product/123"
    html = spider.fetch_page(url)

    if html:
        product = spider.parse_product(html, {
            'title': 'h1.product-title',
            'price': 'span.price',
            'stock': 'span.stock'
        })
        print(product)

3.2 数据存储模块

# database.py
import sqlite3
from datetime import datetime


class PriceDatabase:
    """SQLite-backed store for scraped price records.

    Can be used as a context manager; the connection is closed on exit.
    """

    def __init__(self, db_path='prices.db'):
        """Open (or create) the database at ``db_path`` and ensure the table exists."""
        self.conn = sqlite3.connect(db_path)
        self._init_table()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _init_table(self):
        """Create the ``price_history`` table if it does not exist yet."""
        cursor = self.conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS price_history (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                product_id TEXT,
                product_name TEXT,
                price REAL,
                stock_status TEXT,
                crawl_time TIMESTAMP
            )
        ''')
        self.conn.commit()

    def save_price(self, product_id, name, price, stock):
        """Insert one price record stamped with the current local time.

        Args:
            product_id: Identifier of the monitored product.
            name: Product name as scraped.
            price: Numeric price.
            stock: Stock status text (may be ``None``).
        """
        # Serialize the timestamp explicitly instead of relying on sqlite3's
        # implicit datetime adapter (deprecated since Python 3.12). The text
        # format is the same one that adapter produced.
        crawl_time = datetime.now().isoformat(sep=' ')
        cursor = self.conn.cursor()
        cursor.execute('''
            INSERT INTO price_history
            (product_id, product_name, price, stock_status, crawl_time)
            VALUES (?, ?, ?, ?, ?)
        ''', (product_id, name, price, stock, crawl_time))
        self.conn.commit()

    def get_price_history(self, product_id, days=30):
        """Return the records of ``product_id`` from the last ``days`` days.

        Args:
            product_id: Identifier of the product to look up.
            days: Size of the look-back window in days; must be convertible
                to ``int``.

        Returns:
            A list of row tuples ``(id, product_id, product_name, price,
            stock_status, crawl_time)`` ordered by ``crawl_time``.

        Raises:
            ValueError: If ``days`` is not an integer value.
        """
        # Bind the window as a parameter rather than formatting it into the
        # SQL text, so a malformed ``days`` can never alter the statement.
        window = f'-{int(days)} days'
        cursor = self.conn.cursor()
        # 'localtime' keeps the cutoff in the same (local) time base as the
        # timestamps written by save_price; bare date('now') is UTC.
        cursor.execute('''
            SELECT * FROM price_history
            WHERE product_id = ?
            AND crawl_time >= date('now', 'localtime', ?)
            ORDER BY crawl_time
        ''', (product_id, window))
        return cursor.fetchall()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

3.3 定时监控模块

# monitor.py
import json
import time

from apscheduler.schedulers.background import BackgroundScheduler

from spider import PriceSpider
from database import PriceDatabase


class PriceMonitor:
    """Periodically scrape the configured products and record their prices."""

    def __init__(self):
        self.spider = PriceSpider()
        # Connection owned by the constructing thread; used when check_price
        # is called directly without an explicit ``db``.
        self.db = PriceDatabase()
        self.scheduler = BackgroundScheduler()
        self.products = self._load_products()

    def _load_products(self):
        """Return the list of monitored products.

        Each entry has an ``id``, a ``url`` and a ``selectors`` mapping.
        The list is hard-coded here; it could instead be read from a config
        file or a database.
        """
        return [
            {
                'id': 'prod_001',
                'url': 'https://example.com/product/001',
                'selectors': {
                    'title': 'h1.title',
                    'price': 'span.price',
                    'stock': 'span.stock'
                }
            }
        ]

    @staticmethod
    def _parse_price(price_text):
        """Convert scraped price text to ``float``; return ``None`` if impossible.

        Keeps only digits and dots, so currency symbols and thousands
        separators are dropped (e.g. ``'¥7,999.00'`` -> ``7999.0``).
        """
        if not price_text:
            return None
        cleaned = ''.join(ch for ch in price_text if ch.isdigit() or ch == '.')
        try:
            return float(cleaned)
        except ValueError:
            return None

    def check_price(self, product, db=None):
        """Scrape one product and store its current price.

        Args:
            product: Entry from the product list (``id``, ``url``,
                ``selectors``).
            db: Optional ``PriceDatabase`` to write to; defaults to
                ``self.db``.

        Nothing is stored when the page cannot be fetched or the price text
        cannot be parsed.
        """
        html = self.spider.fetch_page(product['url'])
        if not html:
            return

        data = self.spider.parse_product(html, product['selectors'])

        # The 'price' key is always present but is None when the selector
        # matched nothing, so a dict default would not help here.
        price = self._parse_price(data.get('price'))
        if price is None:
            print(f"价格解析失败: {product['url']}")
            return

        target_db = db if db is not None else self.db
        target_db.save_price(
            product['id'],
            data.get('title'),
            price,
            data.get('stock')
        )

        print(f"[{data.get('crawl_time')}] {data.get('title')}: ¥{price}")

    def run_check(self):
        """Check every monitored product once."""
        # Scheduled jobs run on the scheduler's worker threads, and a sqlite3
        # connection may by default only be used on the thread that created
        # it. Open a connection on the current thread for this run instead
        # of reusing self.db.
        db = PriceDatabase()
        try:
            for product in self.products:
                self.check_price(product, db=db)
        finally:
            db.close()

    def start_scheduler(self, interval_minutes=60):
        """Start the background job that calls run_check every ``interval_minutes``."""
        self.scheduler.add_job(
            self.run_check,
            'interval',
            minutes=interval_minutes,
            id='price_check'
        )
        self.scheduler.start()
        print(f"价格监控已启动，每{interval_minutes}分钟检查一次")

    def stop(self):
        """Shut the scheduler down and close the database connection."""
        self.scheduler.shutdown()
        self.db.close()


# Start monitoring
if __name__ == '__main__':
    monitor = PriceMonitor()
    monitor.start_scheduler(interval_minutes=30)  # check every 30 minutes

    try:
        # Keep the main thread alive; the scheduler runs in the background.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        monitor.stop()
        print("监控已停止")

3.4 可视化报表模块

# visualize.py
import matplotlib.pyplot as plt
import pandas as pd

from database import PriceDatabase


def generate_price_chart(product_id, days=30):
    """Plot the recorded price history of a product and save it as a PNG.

    Args:
        product_id: Identifier of the product to chart.
        days: Size of the look-back window in days.

    The image is written to ``price_trend_<product_id>_<days>days.png`` in
    the current directory and then displayed. Prints a notice and returns
    without plotting when there are no records.
    """
    db = PriceDatabase()
    try:
        data = db.get_price_history(product_id, days)
    finally:
        # Close the connection even if the query raises.
        db.close()

    if not data:
        print("暂无数据")
        return

    # Convert the row tuples to a DataFrame.
    df = pd.DataFrame(data, columns=['id', 'product_id', 'name', 'price', 'stock', 'time'])
    df['time'] = pd.to_datetime(df['time'])

    # NOTE(review): the labels below are Chinese; Matplotlib's default font
    # may lack CJK glyphs -- confirm they render on the target machine.
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.plot(df['time'], df['price'], marker='o', linewidth=2, markersize=4)
        plt.title(f'{df["name"].iloc[0]} - 价格趋势 ({days}天)', fontsize=14)
        plt.xlabel('时间', fontsize=12)
        plt.ylabel('价格 (¥)', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()

        # Save before showing the figure.
        filename = f'price_trend_{product_id}_{days}days.png'
        plt.savefig(filename, dpi=150)
        print(f"图表已保存: {filename}")
        plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


if __name__ == '__main__':
    generate_price_chart('prod_001', days=7)

四、反爬策略与优化

4.1 常见反爬手段

反爬类型	识别特征	应对策略
User-Agent检测	返回403或验证码	轮换UA池
IP频率限制	短时间多次请求被封	代理IP池
动态渲染	页面内容为空或JS生成	Selenium/Playwright
验证码	出现滑块/图形验证码	打码平台或人工处理

4.2 优化后的请求头

# Browser-like request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    # "br" is deliberately not advertised: requests only decodes Brotli when
    # an extra brotli package is installed, and this project does not install
    # one. Offering it could yield an undecodable response body.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

五、完整项目结构

price_monitor/ ├── spider.py # 爬虫核心 ├── database.py # 数据库操作 ├── monitor.py # 定时监控 ├── visualize.py # 可视化 ├── config.json # 配置文件 ├── requirements.txt # 依赖列表 └── prices.db # SQLite数据库

requirements.txt：

requests>=2.28.0 beautifulsoup4>=4.11.0 pandas>=1.5.0 selenium>=4.8.0 matplotlib>=3.6.0 apscheduler>=3.10.0 openpyxl>=3.0.0

六、运行效果

[2025-01-15 09:00:01] iPhone 15 Pro: ¥7999 [2025-01-15 09:30:01] iPhone 15 Pro: ¥7999 [2025-01-15 10:00:01] iPhone 15 Pro: ¥7899 ← 价格变动！

生成趋势图：价格趋势示例

七、扩展方向

接入钉钉/企业微信Webhook：价格变动时自动通知
Web管理界面：用Flask/Streamlit搭建可视化后台
分布式部署：Scrapy-Redis实现多机协作
机器学习预测：用历史数据训练价格预测模型

八、总结

本文完整演示了电商价格监控系统的开发流程，从爬虫抓取到数据存储，再到定时监控和可视化。代码经过实际验证，可直接用于学习或二次开发。

技术交流：有问题欢迎在评论区讨论

本文原创，转载请注明出处。

企业官网建设流程全解析

摘要

一、项目背景与需求分析

1.1 为什么需要价格监控？

1.2 技术选型

二、环境搭建

三、核心代码实现

3.1 基础爬虫模块

3.2 数据存储模块

3.3 定时监控模块

3.4 可视化报表模块

四、反爬策略与优化

4.1 常见反爬手段

4.2 优化后的请求头

五、完整项目结构

六、运行效果

七、扩展方向

八、总结

热门文章

文章分类

标签云

需要专业的网站建设服务？

企业官网建设流程全解析

摘要

一、项目背景与需求分析

1.1 为什么需要价格监控？

1.2 技术选型

二、环境搭建

三、核心代码实现

3.1 基础爬虫模块

3.2 数据存储模块

3.3 定时监控模块

3.4 可视化报表模块

四、反爬策略与优化

4.1 常见反爬手段

4.2 优化后的请求头

五、完整项目结构

六、运行效果

七、扩展方向

八、总结

热门文章

文章分类

标签云

相关文章

NS-USBLoader：Switch游戏管理终极解决方案，5分钟快速上手指南

OpencvSharp 算子学习教案之 - Cv2.FitLine 重载1

3分钟搞定！FigmaCN插件让你彻底告别英文界面困扰

需要专业的网站建设服务？