利用playwright爬取boss直聘岗位信息
介绍
实训课上做的
老师用selenium,我用的playwright
感觉也还行,可以作为学习使用
环境
python3.10+(代码里用到了 list[list[Any]] 这种类型标注写法,至少需要python3.9;更早的python版本能不能安装playwright我没有测试过)
playwright
pandas
playwright安装方式
python环境安装好后:
命令行:
pip install playwright
等待完成后继续:
playwright install
等待完成即可
pandas安装方式
命令行:
pip install pandas
等待完成即可(代码最后用 to_excel 导出 .xlsx 文件,还需要再执行 pip install openpyxl)
代码
import time
from typing import List, Any
from playwright.sync_api import Playwright, sync_playwright, expect
import pandas as pd
# def SearchWrok():
def run(playwright: Playwright, jobname: str, pageNum: int) -> list[list[Any]]:
    """Scrape job cards from zhipin.com search result pages.

    Launches a visible (non-headless) Chromium window, loads result pages
    1 through ``pageNum`` for ``jobname`` (the city code ``101100100`` is
    hard-coded in the URL) and collects one row per ``.job-card-wrapper``
    element found on each page.

    Args:
        playwright: Active Playwright driver, e.g. the object yielded by
            ``sync_playwright()``.
        jobname: Search keyword. It is percent-encoded before being placed
            in the query string.
        pageNum: Number of result pages to crawl, starting at page 1.
            Values below 1 yield an empty result.

    Returns:
        One list per job card, ordered as ``[company_name, job_name,
        job_area, job_salary, job_ask, company_tag_list, job_tag,
        company_desc]``. A field is ``None`` when the corresponding element
        is missing from the card.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import quote

    # CSS selectors, listed in the same order as the output columns.
    field_selectors = (
        '.company-name',
        '.job-name',
        '.job-area',
        '.salary',
        '.job-info .tag-list',
        '.company-tag-list',
        '.job-card-footer .tag-list',
        '.info-desc',
    )

    # Encode the keyword so characters such as '&', '#' or spaces cannot
    # break or truncate the query string.
    encoded_query = quote(str(jobname), safe='')

    browser = playwright.chromium.launch(headless=False)
    try:
        context = browser.new_context()
        page = context.new_page()
        rows: list[list[Any]] = []
        # Inclusive upper bound: pageNum is a page *count*, so pages
        # 1..pageNum must all be visited (range(1, pageNum) skipped the last).
        for page_no in range(1, pageNum + 1):
            page.goto(
                "https://www.zhipin.com/web/geek/job"
                f"?query={encoded_query}&city=101100100&page={page_no}"
            )
            # Fixed pause kept from the original; presumably it gives the
            # dynamically rendered list time to appear -- confirm before
            # shortening or replacing with an explicit wait.
            time.sleep(10)
            for card in page.query_selector_all(".job-card-wrapper"):
                rows.append(
                    [_text_or_none(card, selector) for selector in field_selectors]
                )
        context.close()
        return rows
    finally:
        # Always release the browser, even if navigation or parsing raised.
        browser.close()


def _text_or_none(card, selector: str):
    """Return the text of ``selector`` inside ``card``, or None if it is absent."""
    element = card.query_selector(selector)
    # query_selector returns None when nothing matches; guard against it
    # instead of failing the whole crawl on one incomplete card.
    return element.text_content() if element is not None else None
# Number of result pages to crawl.
pageNum = 3
# Job title used as the search keyword.
jobname = '数据分析'

# The Playwright driver is only needed while scraping; it is shut down as
# soon as run() has returned the collected rows.
with sync_playwright() as playwright:
    data = run(playwright, jobname, pageNum)

# Column headers, in the same order as the fields of each row built by run().
ls = pd.DataFrame(
    data,
    columns=[
        '公司名称',
        '工作名称',
        '工作区域',
        '薪酬',
        '职位要求',
        '公司信息',
        '技能要求',
        '公司福利',
    ],
)
print(ls)
# Export the table to an Excel workbook in the current working directory.
ls.to_excel('数据分析师.xlsx')
阅读剩余
版权声明:
作者:admin
链接:https://www.denceun.com/archives/369
文章版权归作者所有,未经允许请勿转载。
THE END