#!/usr/bin/env python3 """ 闲鱼商品搜索模块 基于 Playwright 实现真实的闲鱼商品搜索功能 """ import asyncio import json import time import sys import os from datetime import datetime from typing import Dict, List, Any, Optional from loguru import logger # 修复Docker环境中的asyncio事件循环策略问题 if sys.platform.startswith('linux') or os.getenv('DOCKER_ENV'): try: # 在Linux/Docker环境中设置事件循环策略 asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) except Exception as e: logger.warning(f"设置事件循环策略失败: {e}") # 确保在Docker环境中使用正确的事件循环 if os.getenv('DOCKER_ENV'): try: # 强制使用SelectorEventLoop(在Docker中更稳定) if hasattr(asyncio, 'SelectorEventLoop'): loop = asyncio.SelectorEventLoop() asyncio.set_event_loop(loop) except Exception as e: logger.warning(f"设置SelectorEventLoop失败: {e}") try: from playwright.async_api import async_playwright PLAYWRIGHT_AVAILABLE = True except ImportError: PLAYWRIGHT_AVAILABLE = False logger.warning("Playwright 未安装,将使用模拟数据") class XianyuSearcher: """闲鱼商品搜索器 - 基于 Playwright""" def __init__(self): self.browser = None self.context = None self.page = None self.api_responses = [] async def safe_get(self, data, *keys, default="暂无"): """安全获取嵌套字典值""" for key in keys: try: data = data[key] except (KeyError, TypeError, IndexError): return default return data async def test_browser_launch(self): """测试浏览器是否能正常启动""" try: if not PLAYWRIGHT_AVAILABLE: return False, "Playwright 未安装" playwright = await async_playwright().start() browser = await playwright.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() await page.goto("https://www.baidu.com") await asyncio.sleep(2) await browser.close() return True, "浏览器测试成功" except Exception as e: return False, f"浏览器测试失败: {str(e)}" async def get_first_valid_cookie(self): """获取第一个有效的cookie""" try: from db_manager import db_manager # 获取所有cookies,返回格式是 {id: value} cookies = db_manager.get_all_cookies() # 找到第一个有效的cookie(长度大于50的认为是有效的) for cookie_id, cookie_value in cookies.items(): if len(cookie_value) > 50: logger.info(f"找到有效cookie: {cookie_id}") return { 'id': cookie_id, 'value': cookie_value } return None except Exception as e: logger.error(f"获取cookie失败: {str(e)}") return None async def set_browser_cookies(self, cookie_value: str): """设置浏览器cookies""" try: if not cookie_value: return False # 解析cookie字符串 cookies = [] for cookie_pair in cookie_value.split(';'): cookie_pair = cookie_pair.strip() if '=' in cookie_pair: name, value = cookie_pair.split('=', 1) cookies.append({ 'name': name.strip(), 'value': value.strip(), 'domain': '.goofish.com', 'path': '/' }) # 设置cookies到浏览器 await self.context.add_cookies(cookies) logger.info(f"成功设置 {len(cookies)} 个cookies到浏览器") return True except Exception as e: logger.error(f"设置浏览器cookies失败: {str(e)}") return False async def init_browser(self): """初始化浏览器""" if not PLAYWRIGHT_AVAILABLE: raise Exception("Playwright 未安装,无法使用真实搜索功能") if not self.browser: playwright = await async_playwright().start() logger.info("正在启动浏览器...") # 简化的浏览器启动参数,避免冲突 browser_args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--no-first-run', '--disable-extensions', '--disable-default-apps', '--no-default-browser-check' ] # 只在确实是Docker环境时添加额外参数 if os.getenv('DOCKER_ENV') == 'true': browser_args.extend([ '--disable-gpu', '--single-process' ]) logger.info("正在启动浏览器...") self.browser = await playwright.chromium.launch( headless=True, # 无头模式,后台运行 args=browser_args ) logger.info("浏览器启动成功,创建上下文...") # 简化上下文创建,减少可能的问题 self.context = await self.browser.new_context( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", viewport={'width': 1280, 'height': 720} ) logger.info("创建页面...") self.page = await self.context.new_page() logger.info("浏览器初始化完成") async def close_browser(self): """关闭浏览器""" if self.browser: await self.browser.close() self.browser = None self.context = None self.page = None async def search_items(self, keyword: str, page: int = 1, page_size: int = 20) -> Dict[str, Any]: """ 搜索闲鱼商品 - 使用 Playwright 获取真实数据 Args: keyword: 搜索关键词 page: 页码,从1开始 page_size: 每页数量 Returns: 搜索结果字典,包含items列表和总数 """ try: if not PLAYWRIGHT_AVAILABLE: logger.error("Playwright 不可用,无法获取真实数据") return { 'items': [], 'total': 0, 'error': 'Playwright 不可用,无法获取真实数据' } logger.info(f"使用 Playwright 搜索闲鱼商品: 关键词='{keyword}', 页码={page}, 每页={page_size}") await self.init_browser() # 清空之前的API响应 self.api_responses = [] data_list = [] # 设置API响应监听器 async def on_response(response): """处理API响应,解析数据""" if "h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search" in response.url: try: # 检查响应状态 if response.status != 200: logger.warning(f"API响应状态异常: {response.status}") return # 安全地获取响应内容 try: result_json = await response.json() except Exception as json_error: logger.warning(f"无法解析响应JSON: {str(json_error)}") return self.api_responses.append(result_json) logger.info(f"捕获到API响应,URL: {response.url}") items = result_json.get("data", {}).get("resultList", []) logger.info(f"从API获取到 {len(items)} 条原始数据") for item in items: try: parsed_item = await self._parse_real_item(item) if parsed_item: data_list.append(parsed_item) except Exception as parse_error: logger.warning(f"解析单个商品失败: {str(parse_error)}") continue except Exception as e: logger.warning(f"响应处理异常: {str(e)}") try: # 获取并设置cookies进行登录 logger.info("正在获取有效的cookies账户...") cookie_data = await self.get_first_valid_cookie() if not cookie_data: raise Exception("未找到有效的cookies账户,请先在Cookie管理中添加有效的闲鱼账户") logger.info(f"使用账户: {cookie_data.get('id', 'unknown')}") logger.info("正在访问闲鱼首页...") await self.page.goto("https://www.goofish.com", timeout=30000) # 设置cookies进行登录 logger.info("正在设置cookies进行登录...") cookie_success = await self.set_browser_cookies(cookie_data.get('value', '')) if not cookie_success: logger.warning("设置cookies失败,将以未登录状态继续") else: logger.info("✅ cookies设置成功,已登录") # 刷新页面以应用cookies await self.page.reload() await asyncio.sleep(2) await self.page.wait_for_load_state("networkidle", timeout=10000) logger.info(f"正在搜索关键词: {keyword}") await self.page.fill('input[class*="search-input"]', keyword) # 注册响应监听 self.page.on("response", on_response) await self.page.click('button[type="submit"]') await self.page.wait_for_load_state("networkidle", timeout=15000) # 等待第一页API响应 logger.info("等待第一页API响应...") await asyncio.sleep(5) # 尝试处理弹窗 try: await self.page.keyboard.press('Escape') await asyncio.sleep(1) except: pass # 等待更多数据 await asyncio.sleep(3) first_page_count = len(data_list) logger.info(f"第1页完成,获取到 {first_page_count} 条数据") # 如果需要获取指定页数据,实现翻页逻辑 if page > 1: # 清空之前的数据,只保留目标页的数据 data_list.clear() await self._navigate_to_page(page) # 根据"人想要"数量进行倒序排列 data_list.sort(key=lambda x: x.get('want_count', 0), reverse=True) total_count = len(data_list) logger.info(f"搜索完成,总共获取到 {total_count} 条真实数据,已按想要人数排序") return { 'items': data_list, 'total': total_count, 'is_real_data': True, 'source': 'playwright' } finally: await self.close_browser() except Exception as e: error_msg = str(e) logger.error(f"Playwright 搜索失败: {error_msg}") # 检查是否是浏览器安装问题 if "Executable doesn't exist" in error_msg or "playwright install" in error_msg: error_msg = "浏览器未安装。请在Docker容器中运行: playwright install chromium" elif "BrowserType.launch" in error_msg: error_msg = "浏览器启动失败。请确保Docker容器有足够的权限和资源" # 如果 Playwright 失败,返回错误信息 return { 'items': [], 'total': 0, 'error': f'搜索失败: {error_msg}' } async def _get_fallback_data(self, keyword: str, page: int, page_size: int) -> Dict[str, Any]: """获取备选数据(模拟数据)""" logger.info(f"使用备选数据: 关键词='{keyword}', 页码={page}, 每页={page_size}") # 模拟搜索延迟 await asyncio.sleep(0.5) # 生成模拟数据 mock_items = [] start_index = (page - 1) * page_size for i in range(page_size): item_index = start_index + i + 1 mock_items.append({ 'item_id': f'mock_{keyword}_{item_index}', 'title': f'{keyword}相关商品 #{item_index} [模拟数据]', 'price': f'{100 + item_index * 10}', 'seller_name': f'卖家{item_index}', 'item_url': f'https://www.goofish.com/item?id=mock_{keyword}_{item_index}', 'publish_time': '2025-07-28', 'tags': [f'标签{i+1}', f'分类{i+1}'], 'main_image': f'https://via.placeholder.com/200x200?text={keyword}商品{item_index}', 'raw_data': { 'mock': True, 'keyword': keyword, 'index': item_index } }) # 模拟总数 total_items = 100 + hash(keyword) % 500 logger.info(f"备选数据生成完成: 找到{len(mock_items)}个商品,总计{total_items}个") return { 'items': mock_items, 'total': total_items, 'is_fallback': True } async def _parse_real_item(self, item_data: Dict[str, Any]) -> Optional[Dict[str, Any]]: """解析真实的闲鱼商品数据""" try: main_data = await self.safe_get(item_data, "data", "item", "main", "exContent", default={}) click_params = await self.safe_get(item_data, "data", "item", "main", "clickParam", "args", default={}) # 解析商品信息 title = await self.safe_get(main_data, "title", default="未知标题") # 价格处理 price_parts = await self.safe_get(main_data, "price", default=[]) price = "价格异常" if isinstance(price_parts, list): price = "".join([str(p.get("text", "")) for p in price_parts if isinstance(p, dict)]) price = price.replace("当前价", "").strip() # 统一价格格式处理 if price and price != "价格异常": # 先移除所有¥符号,避免重复 clean_price = price.replace('¥', '').strip() # 处理万单位的价格 if "万" in clean_price: try: numeric_price = clean_price.replace('万', '').strip() price_value = float(numeric_price) * 10000 price = f"¥{price_value:.0f}" except: price = f"¥{clean_price}" # 如果转换失败,保持原样但确保有¥符号 else: # 普通价格,确保有¥符号 if clean_price and (clean_price[0].isdigit() or clean_price.replace('.', '').isdigit()): price = f"¥{clean_price}" else: price = clean_price if clean_price else "价格异常" # 只提取"想要人数"标签 fish_tags_content = "" fish_tags = await self.safe_get(main_data, "fishTags", default={}) # 遍历所有类型的标签 (r2, r3, r4等) for tag_type, tag_data in fish_tags.items(): if isinstance(tag_data, dict) and "tagList" in tag_data: tag_list = tag_data.get("tagList", []) for tag_item in tag_list: if isinstance(tag_item, dict) and "data" in tag_item: content = tag_item["data"].get("content", "") # 只保留包含"人想要"的标签 if content and "人想要" in content: fish_tags_content = content break if fish_tags_content: # 找到后就退出 break # 其他字段解析 area = await self.safe_get(main_data, "area", default="地区未知") seller = await self.safe_get(main_data, "userNickName", default="匿名卖家") raw_link = await self.safe_get(item_data, "data", "item", "main", "targetUrl", default="") image_url = await self.safe_get(main_data, "picUrl", default="") # 获取商品ID item_id = await self.safe_get(click_params, "item_id", default="未知ID") # 处理发布时间 publish_time = "未知时间" publish_timestamp = click_params.get("publishTime", "") if publish_timestamp and publish_timestamp.isdigit(): try: publish_time = datetime.fromtimestamp( int(publish_timestamp)/1000 ).strftime("%Y-%m-%d %H:%M") except: pass # 提取"人想要"的数字用于排序 want_count = self._extract_want_count(fish_tags_content) return { "item_id": item_id, "title": title, "price": price, "seller_name": seller, "item_url": raw_link.replace("fleamarket://", "https://www.goofish.com/"), "main_image": f"https:{image_url}" if image_url and not image_url.startswith("http") else image_url, "publish_time": publish_time, "tags": [fish_tags_content] if fish_tags_content else [], "area": area, "want_count": want_count, # 添加想要人数用于排序 "raw_data": item_data } except Exception as e: logger.warning(f"解析真实商品数据失败: {str(e)}") return None def _extract_want_count(self, tags_content: str) -> int: """从标签内容中提取"人想要"的数字""" try: if not tags_content or "人想要" not in tags_content: return 0 # 使用正则表达式提取数字 import re # 匹配类似 "123人想要" 或 "1.2万人想要" 的格式 pattern = r'(\d+(?:\.\d+)?(?:万)?)\s*人想要' match = re.search(pattern, tags_content) if match: number_str = match.group(1) if '万' in number_str: # 处理万单位 number = float(number_str.replace('万', '')) * 10000 return int(number) else: return int(float(number_str)) return 0 except Exception as e: logger.warning(f"提取想要人数失败: {str(e)}") return 0 async def _navigate_to_page(self, target_page: int): """导航到指定页面""" try: logger.info(f"正在导航到第 {target_page} 页...") # 等待页面稳定 await asyncio.sleep(2) # 查找并点击下一页按钮 next_button_selectors = [ '.search-page-tiny-arrow-right--oXVFaRao', # 用户找到的正确选择器 '[class*="search-page-tiny-arrow-right"]', # 更通用的版本 'button[aria-label="下一页"]', 'button:has-text("下一页")', 'a:has-text("下一页")', '.ant-pagination-next', 'li.ant-pagination-next a', 'a[aria-label="下一页"]', '[class*="next"]', '[class*="pagination-next"]', 'button[title="下一页"]', 'a[title="下一页"]' ] # 从第2页开始点击 for current_page in range(2, target_page + 1): logger.info(f"正在点击到第 {current_page} 页...") next_button_found = False for selector in next_button_selectors: try: next_button = self.page.locator(selector).first if await next_button.is_visible(timeout=3000): # 检查按钮是否可点击(不是禁用状态) is_disabled = await next_button.get_attribute("disabled") has_disabled_class = await next_button.evaluate("el => el.classList.contains('ant-pagination-disabled') || el.classList.contains('disabled')") if not is_disabled and not has_disabled_class: logger.info(f"找到下一页按钮,正在点击...") # 滚动到按钮位置 await next_button.scroll_into_view_if_needed() await asyncio.sleep(1) # 点击下一页 await next_button.click() await self.page.wait_for_load_state("networkidle", timeout=15000) # 等待新数据加载 await asyncio.sleep(5) logger.info(f"成功导航到第 {current_page} 页") next_button_found = True break except Exception as e: continue if not next_button_found: logger.warning(f"无法找到下一页按钮,停止在第 {current_page-1} 页") break except Exception as e: logger.error(f"导航到第 {target_page} 页失败: {str(e)}") async def search_multiple_pages(self, keyword: str, total_pages: int = 1) -> Dict[str, Any]: """ 搜索多页闲鱼商品 Args: keyword: 搜索关键词 total_pages: 总页数 Returns: 搜索结果字典,包含所有页面的items列表和总数 """ browser_initialized = False try: if not PLAYWRIGHT_AVAILABLE: logger.error("Playwright 不可用,无法获取真实数据") return { 'items': [], 'total': 0, 'error': 'Playwright 不可用,无法获取真实数据' } logger.info(f"使用 Playwright 搜索多页闲鱼商品: 关键词='{keyword}', 总页数={total_pages}") # 确保浏览器初始化 await self.init_browser() browser_initialized = True # 验证浏览器状态 if not self.browser or not self.page: raise Exception("浏览器初始化失败") logger.info("浏览器初始化成功,开始搜索...") # 清空之前的API响应 self.api_responses = [] all_data_list = [] # 设置API响应监听器 async def on_response(response): """处理API响应,解析数据""" if "h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search" in response.url: try: # 检查响应状态 if response.status != 200: logger.warning(f"API响应状态异常: {response.status}") return # 安全地获取响应内容 try: result_json = await response.json() except Exception as json_error: logger.warning(f"无法解析响应JSON: {str(json_error)}") return self.api_responses.append(result_json) logger.info(f"捕获到API响应,URL: {response.url}") items = result_json.get("data", {}).get("resultList", []) logger.info(f"从API获取到 {len(items)} 条原始数据") for item in items: try: parsed_item = await self._parse_real_item(item) if parsed_item: all_data_list.append(parsed_item) except Exception as parse_error: logger.warning(f"解析单个商品失败: {str(parse_error)}") continue except Exception as e: logger.warning(f"响应处理异常: {str(e)}") try: # 检查浏览器状态 if not self.page or self.page.is_closed(): raise Exception("页面已关闭或不可用") # 获取并设置cookies进行登录 logger.info("正在获取有效的cookies账户...") cookie_data = await self.get_first_valid_cookie() if not cookie_data: raise Exception("未找到有效的cookies账户,请先在Cookie管理中添加有效的闲鱼账户") logger.info(f"使用账户: {cookie_data.get('id', 'unknown')}") logger.info("正在访问闲鱼首页...") await self.page.goto("https://www.goofish.com", timeout=30000) # 设置cookies进行登录 logger.info("正在设置cookies进行登录...") cookie_success = await self.set_browser_cookies(cookie_data.get('value', '')) if not cookie_success: logger.warning("设置cookies失败,将以未登录状态继续") else: logger.info("✅ cookies设置成功,已登录") # 刷新页面以应用cookies await self.page.reload() await asyncio.sleep(2) # 再次检查页面状态 if self.page.is_closed(): raise Exception("页面在导航后被关闭") logger.info("等待页面加载完成...") await self.page.wait_for_load_state("networkidle", timeout=15000) # 等待页面稳定 logger.info("等待页面稳定...") await asyncio.sleep(3) # 增加等待时间 # 再次检查页面状态 if self.page.is_closed(): raise Exception("页面在等待加载后被关闭") # 获取页面标题和URL用于调试 page_title = await self.page.title() page_url = self.page.url logger.info(f"当前页面标题: {page_title}") logger.info(f"当前页面URL: {page_url}") logger.info(f"正在搜索关键词: {keyword}") # 尝试多种搜索框选择器 search_selectors = [ 'input[class*="search-input"]', 'input[placeholder*="搜索"]', 'input[type="text"]', '.search-input', '#search-input' ] search_input = None for selector in search_selectors: try: logger.info(f"尝试查找搜索框,选择器: {selector}") search_input = await self.page.wait_for_selector(selector, timeout=5000) if search_input: logger.info(f"✅ 找到搜索框,使用选择器: {selector}") break except Exception as e: logger.info(f"❌ 选择器 {selector} 未找到搜索框: {str(e)}") continue if not search_input: raise Exception("未找到搜索框元素") # 检查页面状态 if self.page.is_closed(): raise Exception("页面在查找搜索框后被关闭") await search_input.fill(keyword) logger.info(f"✅ 搜索关键词 '{keyword}' 已填入搜索框") # 注册响应监听 self.page.on("response", on_response) logger.info("🖱️ 准备点击搜索按钮...") await self.page.click('button[type="submit"]') logger.info("✅ 搜索按钮已点击") await self.page.wait_for_load_state("networkidle", timeout=15000) # 等待第一页API响应 logger.info("等待第一页API响应...") await asyncio.sleep(10) # 增加等待时间 # 尝试处理弹窗 try: await self.page.keyboard.press('Escape') await asyncio.sleep(1) except: pass # 等待更多数据 await asyncio.sleep(3) first_page_count = len(all_data_list) logger.info(f"第1页完成,获取到 {first_page_count} 条数据") # 如果需要获取更多页数据 if total_pages > 1: for page_num in range(2, total_pages + 1): logger.info(f"正在获取第 {page_num} 页数据...") # 等待页面稳定 await asyncio.sleep(2) # 查找并点击下一页按钮 next_button_found = False next_button_selectors = [ '.search-page-tiny-arrow-right--oXVFaRao', '[class*="search-page-tiny-arrow-right"]', 'button[aria-label="下一页"]', 'button:has-text("下一页")', 'a:has-text("下一页")', '.ant-pagination-next', 'li.ant-pagination-next a', 'a[aria-label="下一页"]' ] for selector in next_button_selectors: try: next_button = self.page.locator(selector).first if await next_button.is_visible(timeout=3000): # 检查按钮是否可点击 is_disabled = await next_button.get_attribute("disabled") has_disabled_class = await next_button.evaluate("el => el.classList.contains('ant-pagination-disabled') || el.classList.contains('disabled')") if not is_disabled and not has_disabled_class: logger.info(f"找到下一页按钮,正在点击到第 {page_num} 页...") # 记录点击前的数据量 before_click_count = len(all_data_list) # 滚动到按钮位置并点击 await next_button.scroll_into_view_if_needed() await asyncio.sleep(1) await next_button.click() await self.page.wait_for_load_state("networkidle", timeout=15000) # 等待新数据加载 await asyncio.sleep(5) # 检查是否有新数据 after_click_count = len(all_data_list) new_items = after_click_count - before_click_count if new_items > 0: logger.info(f"第 {page_num} 页成功,新增 {new_items} 条数据") next_button_found = True break else: logger.warning(f"第 {page_num} 页点击后没有新数据,可能已到最后一页") next_button_found = False break except Exception as e: continue if not next_button_found: logger.warning(f"无法获取第 {page_num} 页数据,停止在第 {page_num-1} 页") break # 根据"人想要"数量进行倒序排列 all_data_list.sort(key=lambda x: x.get('want_count', 0), reverse=True) total_count = len(all_data_list) logger.info(f"多页搜索完成,总共获取到 {total_count} 条真实数据,已按想要人数排序") return { 'items': all_data_list, 'total': total_count, 'is_real_data': True, 'source': 'playwright' } finally: # 确保浏览器被正确关闭 if browser_initialized: try: await self.close_browser() logger.info("浏览器已安全关闭") except Exception as close_error: logger.warning(f"关闭浏览器时出错: {str(close_error)}") except Exception as e: error_msg = str(e) logger.error(f"Playwright 多页搜索失败: {error_msg}") # 检查是否是浏览器相关问题 if "Executable doesn't exist" in error_msg or "playwright install" in error_msg: error_msg = "浏览器未安装。请在Docker容器中运行: playwright install chromium" elif "BrowserType.launch" in error_msg: error_msg = "浏览器启动失败。请确保Docker容器有足够的权限和资源" elif "Target page, context or browser has been closed" in error_msg: error_msg = "浏览器页面被意外关闭。这可能是由于网站反爬虫检测或系统资源限制导致的" elif "Page.goto" in error_msg and "closed" in error_msg: error_msg = "页面导航失败,浏览器连接已断开" elif "Timeout" in error_msg and "exceeded" in error_msg: error_msg = "页面加载超时。网络连接可能不稳定或网站响应缓慢" # 如果 Playwright 失败,返回错误信息 return { 'items': [], 'total': 0, 'error': f'多页搜索失败: {error_msg}' } async def _get_multiple_fallback_data(self, keyword: str, total_pages: int) -> Dict[str, Any]: """获取多页备选数据(模拟数据)""" logger.info(f"使用多页备选数据: 关键词='{keyword}', 总页数={total_pages}") # 模拟搜索延迟 await asyncio.sleep(1) # 生成多页模拟数据 all_mock_items = [] for page in range(1, total_pages + 1): page_size = 20 # 每页20条 start_index = (page - 1) * page_size for i in range(page_size): item_index = start_index + i + 1 all_mock_items.append({ 'item_id': f'mock_{keyword}_{item_index}', 'title': f'{keyword}相关商品 #{item_index} [模拟数据-第{page}页]', 'price': f'{100 + item_index * 10}', 'seller_name': f'卖家{item_index}', 'item_url': f'https://www.goofish.com/item?id=mock_{keyword}_{item_index}', 'publish_time': '2025-07-28', 'tags': [f'标签{i+1}', f'分类{i+1}'], 'main_image': f'https://via.placeholder.com/200x200?text={keyword}商品{item_index}', 'raw_data': { 'mock': True, 'keyword': keyword, 'index': item_index, 'page': page } }) total_count = len(all_mock_items) logger.info(f"多页备选数据生成完成: 找到{total_count}个商品,共{total_pages}页") return { 'items': all_mock_items, 'total': total_count, 'is_fallback': True } async def _parse_item_old(self, item_data: Dict[str, Any]) -> Optional[Dict[str, Any]]: """ 解析单个商品数据 Args: item_data: 商品原始数据 Returns: 解析后的商品信息 """ try: # 基本信息 - 适配多种可能的字段名 item_id = (item_data.get('id') or item_data.get('itemId') or item_data.get('item_id') or item_data.get('spuId') or '') title = (item_data.get('title') or item_data.get('name') or item_data.get('itemTitle') or item_data.get('subject') or '') # 价格处理 price_raw = (item_data.get('price') or item_data.get('priceText') or item_data.get('currentPrice') or item_data.get('realPrice') or '') # 清理价格格式 if isinstance(price_raw, (int, float)): price = str(price_raw) elif isinstance(price_raw, str): price = price_raw.replace('¥', '').replace('元', '').strip() else: price = '价格面议' # 卖家信息 seller_info = (item_data.get('seller') or item_data.get('user') or item_data.get('owner') or {}) seller_name = '' if seller_info: seller_name = (seller_info.get('nick') or seller_info.get('nickname') or seller_info.get('name') or seller_info.get('userName') or '匿名用户') # 商品链接 if item_id: item_url = f"https://www.goofish.com/item?id={item_id}" else: item_url = item_data.get('url', item_data.get('link', '')) # 发布时间 publish_time = (item_data.get('publishTime') or item_data.get('createTime') or item_data.get('gmtCreate') or item_data.get('time') or '') # 格式化时间 if publish_time and isinstance(publish_time, (int, float)): import datetime publish_time = datetime.datetime.fromtimestamp(publish_time / 1000).strftime('%Y-%m-%d') # 商品标签 tags = item_data.get('tags', item_data.get('labels', [])) tag_names = [] if isinstance(tags, list): for tag in tags: if isinstance(tag, dict): tag_name = (tag.get('name') or tag.get('text') or tag.get('label') or '') if tag_name: tag_names.append(tag_name) elif isinstance(tag, str): tag_names.append(tag) # 图片 images = (item_data.get('images') or item_data.get('pics') or item_data.get('pictures') or item_data.get('imgList') or []) main_image = '' if images and len(images) > 0: if isinstance(images[0], str): main_image = images[0] elif isinstance(images[0], dict): main_image = (images[0].get('url') or images[0].get('src') or images[0].get('image') or '') # 如果没有图片,使用默认占位图 if not main_image: main_image = f'https://via.placeholder.com/200x200?text={title[:10] if title else "商品"}...' return { 'item_id': item_id, 'title': title, 'price': price, 'seller_name': seller_name, 'item_url': item_url, 'publish_time': publish_time, 'tags': tag_names, 'main_image': main_image, 'raw_data': item_data # 保留原始数据以备后用 } except Exception as e: logger.warning(f"解析商品数据失败: {str(e)}") return None # 搜索器工具函数 async def search_xianyu_items(keyword: str, page: int = 1, page_size: int = 20) -> Dict[str, Any]: """ 搜索闲鱼商品的便捷函数,带重试机制 Args: keyword: 搜索关键词 page: 页码 page_size: 每页数量 Returns: 搜索结果 """ max_retries = 2 retry_delay = 5 # 秒,增加重试间隔 for attempt in range(max_retries + 1): searcher = None try: # 每次搜索都创建新的搜索器实例,避免浏览器状态混乱 searcher = XianyuSearcher() logger.info(f"开始单页搜索,尝试次数: {attempt + 1}/{max_retries + 1}") result = await searcher.search_items(keyword, page, page_size) # 如果成功获取到数据,直接返回 if result.get('items') or not result.get('error'): logger.info(f"单页搜索成功,获取到 {len(result.get('items', []))} 条数据") return result except Exception as e: error_msg = str(e) logger.error(f"搜索商品失败 (尝试 {attempt + 1}/{max_retries + 1}): {error_msg}") # 如果是最后一次尝试,返回错误 if attempt == max_retries: return { 'items': [], 'total': 0, 'error': f"搜索失败,已重试 {max_retries} 次: {error_msg}" } # 等待后重试 logger.info(f"等待 {retry_delay} 秒后重试...") await asyncio.sleep(retry_delay) finally: # 确保搜索器被正确关闭 if searcher: try: await searcher.close_browser() except Exception as close_error: logger.warning(f"关闭搜索器时出错: {str(close_error)}") # 理论上不会到达这里 return { 'items': [], 'total': 0, 'error': "未知错误" } async def search_multiple_pages_xianyu(keyword: str, total_pages: int = 1) -> Dict[str, Any]: """ 搜索多页闲鱼商品的便捷函数,带重试机制 Args: keyword: 搜索关键词 total_pages: 总页数 Returns: 搜索结果 """ max_retries = 2 retry_delay = 5 # 秒,增加重试间隔 for attempt in range(max_retries + 1): searcher = None try: # 每次搜索都创建新的搜索器实例,避免浏览器状态混乱 searcher = XianyuSearcher() logger.info(f"开始多页搜索,尝试次数: {attempt + 1}/{max_retries + 1}") result = await searcher.search_multiple_pages(keyword, total_pages) # 如果成功获取到数据,直接返回 if result.get('items') or not result.get('error'): logger.info(f"多页搜索成功,获取到 {len(result.get('items', []))} 条数据") return result except Exception as e: error_msg = str(e) logger.error(f"多页搜索商品失败 (尝试 {attempt + 1}/{max_retries + 1}): {error_msg}") # 如果是最后一次尝试,返回错误 if attempt == max_retries: return { 'items': [], 'total': 0, 'error': f"搜索失败,已重试 {max_retries} 次: {error_msg}" } # 等待后重试 logger.info(f"等待 {retry_delay} 秒后重试...") await asyncio.sleep(retry_delay) finally: # 确保搜索器被正确关闭 if searcher: try: await searcher.close_browser() except Exception as close_error: logger.warning(f"关闭搜索器时出错: {str(close_error)}") # 理论上不会到达这里 return { 'items': [], 'total': 0, 'error': "未知错误" } async def get_item_detail_from_api(item_id: str) -> Optional[str]: """ 从外部API获取商品详情 Args: item_id: 商品ID Returns: 商品详情文本,获取失败返回None """ try: # 使用默认的API配置 api_base_url = 'https://selfapi.zhinianboke.com/api/getItemDetail' timeout_seconds = 10 api_url = f"{api_base_url}/{item_id}" logger.info(f"正在从外部API获取商品详情: {item_id}") # 使用简单的HTTP请求 import aiohttp timeout = aiohttp.ClientTimeout(total=timeout_seconds) async with aiohttp.ClientSession(timeout=timeout) as session: async with session.get(api_url) as response: if response.status == 200: result = await response.json() # 检查返回状态 if result.get('status') == '200' and result.get('data'): item_detail = result['data'] logger.info(f"成功获取商品详情: {item_id}, 长度: {len(item_detail)}") return item_detail else: logger.warning(f"API返回状态异常: {result.get('status')}, message: {result.get('message')}") return None else: logger.warning(f"API请求失败: HTTP {response.status}") return None except asyncio.TimeoutError: logger.warning(f"获取商品详情超时: {item_id}") return None except Exception as e: logger.error(f"获取商品详情异常: {item_id}, 错误: {str(e)}") return None