Selenium加载用户目录爬取某宝电商数据

发布时间：2025-04-23 19:20:27　编辑：123　阅读（1726）

Selenium加载用户目录爬取某宝电商数据

Selenium 通过使用 WebDriver 支持市场上所有主流浏览器的自动化。 WebDriver 是一个 API 和协议，它定义了一个语言中立的接口，用于控制 web 浏览器的行为。每个浏览器都有一个特定的 WebDriver 实现，称为驱动程序。驱动程序是负责委派给浏览器的组件，并处理与 Selenium 和浏览器之间的通信。

这种分离是有意为之的，目的之一是让浏览器供应商对各自浏览器的实现负责。Selenium 会尽可能使用这些第三方驱动程序；对于尚无可用驱动程序的浏览器，项目也提供了自行维护的驱动程序。

Selenium 框架通过一个面向用户的界面将所有这些部分连接在一起，该界面允许透明地使用不同的浏览器后端，从而实现跨浏览器和跨平台自动化。

Selenium 的设置与其他商业工具有很大不同。在开始编写 Selenium 代码之前，您必须先安装所选语言的相关类库，以及目标浏览器对应的驱动程序。

Selenium安装

pip install selenium -i https://mirrors.aliyun.com/pypi/simple

查看 Chrome 浏览器版本：在地址栏输入 chrome://version/

下载对应的chromedriver版本：https://googlechromelabs.github.io/chrome-for-testing/

执行代码前，请先用 Chrome 手动登录某宝，然后关闭所有 Chrome 窗口（同一个用户数据目录不能同时被两个 Chrome 进程占用，否则 Selenium 无法正常启动浏览器）。下面的代码爬取某宝的服装数据，生成的电商服装数据集用于多模态模型微调，代码如下：

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import requests
import hashlib
import os
import json

# Chrome launch configuration shared by every function below.
option = webdriver.ChromeOptions()
# Path to the chromedriver executable; it has to match the installed Chrome version.
service = Service(executable_path=r'D:\Python39\chromedriver.exe')
# Reuse the user's own Chrome profile so an existing login session is picked up
# and no login is needed for a while.
option.add_argument(r'--user-data-dir=C:\Users\Sam\AppData\Local\Google\Chrome\User Data')
option.add_argument('--profile-directory=Default')
option.add_argument('--start-maximized')
# Turn off the Blink feature that marks the browser as automation-controlled.
option.add_argument("--disable-blink-features=AutomationControlled")
option.add_argument("--disable-gpu")
option.add_argument("--no-sandbox")
# The value must NOT be wrapped in quotes: each argument is handed to Chrome
# as-is (no shell is involved), so literal quotes would end up inside the
# User-Agent header that the site receives.
option.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
browser = webdriver.Chrome(service=service, options=option)
# Register a script that redefines navigator.webdriver to return undefined in
# newly created documents, hiding the most common automation marker.
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
            """
})

# Keyword that get_list_page() types into the site's search box.
search_keyword = '2025年最流行的新款女装上衣'


def get_list_page(url):
    """
    Search the site for ``search_keyword`` and collect product detail links.

    Opens ``url``, types the module-level ``search_keyword`` into the search
    box, submits it, switches to the last window handle and reads the ``href``
    of every result anchor.

    :param url: address of the page that contains the search box
    :return: list of detail-page URLs; an empty list when any step fails (the
             error is printed and the browser is closed in that case)
    """
    try:
        browser.get(url)
        # Implicit wait: how long (seconds) element lookups keep retrying.
        browser.implicitly_wait(60)
        # Type the keyword into the search box and submit the search.
        browser.find_element(By.XPATH, "//input[@id='q']").send_keys(search_keyword)
        browser.find_element(By.XPATH, "//button[@class='btn-search tb-bg']").click()
        browser.implicitly_wait(120)
        # The results are expected to open in a new window/tab, so move the
        # driver to the last handle in the list.
        windows = browser.window_handles
        browser.switch_to.window(windows[-1])
        anchors = browser.find_elements(By.XPATH, "//div[@id='content_items_wrapper']/div/a")
        # An anchor without an href yields None; drop those so callers only
        # ever receive URLs they can navigate to.
        hrefs = (anchor.get_attribute("href") for anchor in anchors)
        detail_pages_urls = [href for href in hrefs if href]
        print(detail_pages_urls)
        return detail_pages_urls
    except Exception as e:
        print(e)
        browser.quit()
        # Always hand back a list so the caller can iterate the result safely.
        return []

def get_product_Details(detail_pages_urls):
    """
    Visit every product page and append one training record per product.

    For each URL the title, the thumbnail image and the attribute rows are
    read from the page; the image is saved with ``download_image`` and the
    record is written with ``write_jsonl``. A product whose page cannot be
    scraped or whose image cannot be downloaded is skipped. The browser is
    closed when the function finishes, including when it exits with an error.

    :param detail_pages_urls: iterable of product detail URLs; ``None`` or an
                              empty iterable means there is nothing to crawl
    :return: None
    """
    # Function-scope import so this block does not rely on extra file-level imports.
    from selenium.common.exceptions import WebDriverException

    try:
        for _url in detail_pages_urls or []:
            try:
                browser.get(_url)
                browser.implicitly_wait(90)
                title = browser.find_element(By.XPATH, "//div[@id='tbpc-detail-item-title']/h1").text
                print(title)
                image_url = browser.find_element(By.XPATH,"//img[@class='QJEEHAN8H5--thumbnailPic--_2b4183e']").get_attribute("src")
                print(image_url)
                # Save the image locally; the result is the stored file path.
                image_abs_path = download_image(image_url)
                if not image_abs_path:
                    # A record without an image is useless for multimodal
                    # training, so do not write it.
                    print('image download failed, skipping: ' + str(_url))
                    continue
                # Attribute rows (name / value pairs) of the product.
                p_i_ret = browser.find_elements(By.XPATH, "//div[@class='QJEEHAN8H5--infoItem--_6d170e8']")
                all_product_introduction = get_describe_text(p_i_ret)
                print(all_product_introduction)
                # Append the sample to the jsonl dataset.
                write_jsonl(title, image_abs_path, all_product_introduction)
            except WebDriverException as e:
                # One broken page must not abort the whole crawl.
                print(e)
    finally:
        # Release the browser even when an unexpected error escapes the loop.
        browser.quit()

def get_describe_text(p_i_ret):
    """
    Flatten product attribute rows into a single ``name:value,`` string.

    For every row the text of its first child ``div`` and of its second child
    ``div`` are joined with a colon. Each pair, including the last one, is
    followed by a comma. An empty input gives an empty string.

    :param p_i_ret: iterable of elements exposing ``find_element``
    :return: the concatenated description text
    """
    pairs = (
        (
            row.find_element(By.XPATH, "./div[1]").text,
            row.find_element(By.XPATH, "./div[2]").text,
        )
        for row in p_i_ret
    )
    return ''.join(label + ':' + value + ',' for label, value in pairs)


def download_image(image_url, image_dir=r'D:\SpiderTaobao\images'):
    """
    Download an image and store it under ``image_dir``.

    The file name is the MD5 hex digest of the downloaded bytes plus ``.jpg``,
    so downloading the same image twice overwrites the same file instead of
    creating a duplicate.

    :param image_url: URL of the image; an empty value or ``None`` is rejected
    :param image_dir: directory the file is written to; created when missing
    :return: path of the saved file, or ``None`` when no URL was given or the
             HTTP request failed (the request error is printed)
    """
    if not image_url:
        return None
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36'}
    try:
        response = requests.get(url=image_url, headers=headers, timeout=30)
        # Without this check a 403/404 error page would be saved as a ".jpg".
        response.raise_for_status()
        img = response.content
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    image_name = hashlib.md5(img).hexdigest() + '.jpg'
    # The target directory may not exist yet on a fresh machine.
    os.makedirs(image_dir, exist_ok=True)
    image_abs_path = os.path.join(image_dir, image_name)
    with open(image_abs_path, 'wb') as f:
        f.write(img)
    return image_abs_path


def write_jsonl(title, image_abs_path, all_product_introduction):
    """
    Append one multimodal training sample to ``taobao_women_clothing.jsonl``.

    The sample is a single JSON line holding a fixed user prompt, an assistant
    answer made of the title and the product description joined by a comma,
    and a one-element list with the image path.

    :param title: product title
    :param image_abs_path: path of the downloaded product image
    :param all_product_introduction: flattened product attribute text
    :return: None
    """
    answer = title + ',' + all_product_introduction
    record = {
        'messages': [
            {'role': 'user', 'content': '描述这张图片，给出详细的信息。'},
            {'role': 'assistant', 'content': answer},
        ],
        'images': [image_abs_path],
    }
    # Append mode: every call adds exactly one line to the dataset file.
    with open(r'taobao_women_clothing.jsonl', mode='a', encoding='utf-8') as fb:
        fb.write(json.dumps(record, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    # Entry point: collect the product links from the search results, then
    # scrape every product page into the jsonl dataset.
    url = "https://www.taobao.com/"
    detail_pages_urls = get_list_page(url)
    # A None result means get_list_page() hit an error and already closed the
    # browser, so there is nothing left to crawl. Any list, even an empty one,
    # is passed on because get_product_Details() is what closes the browser.
    if detail_pages_urls is not None:
        get_product_Details(detail_pages_urls)

结果如下：

关键字：

上一篇： selenium 无头模式以及防止被检测

下一篇：没有了



搜索

热门推荐

最新文章

Python搭建一个RAG系统(分片/检索/召回/重排序/生成)
 1026°
Browser-use:智能浏览器自动化(Web-Agent)
 1708°
使用 LangChain 实现本地 Agent
 1399°
使用 LangChain 构建本地 RAG 应用
 1331°
使用LLaMA-Factory微调大模型的function calling能力
 1593°
复现一个简单Agent系统
 1428°
LLaMA Factory-Lora微调实现声控语音多轮问答对话-1
 2079°
LLaMA Factory微调后的模型合并导出和部署-4
 3623°
LLaMA Factory微调模型的各种参数怎么设置-3
 3541°
LLaMA Factory构建高质量数据集-2
 2498°

博主信息

姓名：Run
职业：谜
邮箱：383697894@qq.com
定位：上海 · 松江

扫我打开

友情链接

百度 淘宝 腾讯 慕课网 CSDN 博客园 51cto博客