Example
Last updated
Last updated
Before you test these examples, you need to install some dependency packages
pip install webdriver-manager
pip install ddddocr
pip install --upgrade crawlist
Before using the script, you need to configure your own webdrvier.
import random
import unittest
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver as wd
from selenium.webdriver.chrome.service import Service
import crawlipt as cpt
import crawlist as cl
import ddddocr as docr
def get_driver(is_headless=False):
option = wd.ChromeOptions()
arguments = [
"no-sandbox",
"--disable-extensions",
'--disable-gpu',
'User-Agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"',
"window-size=1920x3000",
"start-maximized",
'cache-control="max-age=0"'
"disable-blink-features=AutomationControlled"
]
for argument in arguments:
option.add_argument(argument)
if is_headless:
option.add_argument("--headless")
option.add_experimental_option('excludeSwitches', ['enable-automation'])
webdriver = wd.Chrome(service=Service(ChromeDriverManager().install()), options=option)
webdriver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => false
})
"""
})
return webdriver
Search for "Baidu Tieba" on Baidu
webdriver = get_driver()
script = {
"method": "redirect",
"url": "https://www.baidu.com/",
"next": {
"method": "input",
"xpath": "//*[@id=\"kw\"]",
"text": "Baidu Tieba",
"next": {
"method": "click",
"xpath": "//*[@id=\"su\"]"
}
}
}
cpt.Script(script, interval=0.1)(webdriver)
webdriver.quit()
Use Baidu Translate and return the translation result
webdriver = get_driver(is_headless=True)
step = [{
"method": "redirect",
"url": "https://fanyi.baidu.com/mtpe-individual/multimodal#/",
}, {
"method": "input",
"xpath": "//*[@id=\"editor-text\"]/div[1]/div[1]/div/div/div/div",
"text": "Hello, World",
}, {
"method": "getInnerText",
"xpath": "//*[@id=\"trans-selection\"]/div/span",
}]
script = cpt.Script.generate(step)
result = cpt.Script(script, interval=0.1)(webdriver)
print(result)
webdriver.quit()
webdriver = get_driver()
step = [{
"method": "redirect",
"url": "https://www.psy525.cn/ceshi/84307.html",
}, {
"method": "click",
"xpath": "//*[@id=\"fun\"]/a"
}]
for i in range(90):
index = random.randint(1, 5)
step.append({
"method": "click",
"xpath": f"//*[@id=\"question_{i + 1}\"]/fieldset/ul/li[{index}]"
})
scripts = cpt.Script.generate(step)
json = cpt.Script.generate_json(scripts)
print(json)
cpt.Script(scripts, interval=1)(webdriver)
webdriver.quit()
webdriver = get_driver()
step = [{
"method": "redirect",
"url": "https://artsandculture.google.com/",
},{
"method": "click",
"xpath": "//*[@id=\"yDmH0d\"]/div[2]/div[1]/div[3]/div[2]/span/span",
},{
"method": "getInnerText",
"xpath": "//*[@id=\"yDmH0d\"]/div[2]/div[2]/div/div[3]/div/ul/div/li[1]/a"
},{
"method": "input",
"xpath": "//*[@id=\"yDmH0d\"]/div[2]/div[2]/div/input",
"text": "__PRE_RETURN__",
},{
"method": "enter",
"xpath": "//*[@id=\"yDmH0d\"]/div[2]/div[2]/div/input",
}]
scripts = cpt.Script.generate(step)
cpt.Script(scripts, interval=1)(webdriver)
webdriver.quit()
Add your own action method and use ddddocr to crack the verification code, return the cracking result, and pass it to the next action method.
webdriver = get_driver()
@cpt.check(exclude="driver")
@cpt.alias("captcha")
def crackCaptcha(driver: WebDriver, xpath: str) -> str:
"""
Handling keyboard input events
:param driver: selenium webdriver
:param xpath: The xpath path of the captcha
"""
element = driver.find_element(By.XPATH, xpath)
pic = element.screenshot_as_png
ocr = docr.DdddOcr(show_ad=False)
res = ocr.classification(pic)
return res
cpt.Script.add_action(crackCaptcha)
step = [{
"method": "redirect",
"url": "http://www.shuhai.com/login",
},{
"method": "input",
"xpath": "//*[@id=\"login_form\"]/div[2]/div[1]/div[2]/input",
"text": "username",
},{
"method": "input",
"xpath": "//*[@id=\"login_form\"]/div[2]/div[2]/div[2]/input",
"text": "password",
},{
"method": "crackCaptcha", # or alias: "method": "captcha"
"xpath": "//*[@id=\"checkcode2\"]",
},{
"method": "input",
"xpath": "//*[@id=\"login_form\"]/div[2]/div[3]/div[2]/input",
"text" : "__PRE_RETURN__"
},{
"method": "click",
"xpath": "//*[@id=\"dosubmit\"]",
}]
scripts = cpt.Script.generate(step)
cpt.Script(scripts, interval=3)(webdriver)
webdriver.quit()
Use if to determine whether input is needed in the input box
webdriver = get_driver()
step = [{
"method": "redirect",
"url": "https://www.baidu.com/",
}, {
"method": "input",
"xpath": "//*[@id=\"kw\"]",
"text": "your search text",
"if": {
"condition": "presence",
"xpath": "//*[@id=\"su\"]"
}
}, {
"method": "input",
"xpath": "//*[@id=\"kw\"]",
"text": "your search text",
}]
cpt.Script(step, interval=3)
Add your own condition method for addition and subtraction counting, and return the final result
webdriver = get_driver()
@cpt.check(exclude="driver")
@cpt.alias("check")
def checkNum(driver: WebDriver, xpath: str) -> bool:
"""
your doc
:param driver: selenium webdriver
:param xpath: the xpath of element
"""
element = driver.find_element(By.XPATH, xpath)
value = int(element.get_attribute("value"))
if value > 10:
return False
else:
return True
cpt.Script.add_condition(checkNum)
step = [{
"method": "redirect",
"url": "https://www.bchrt.com/tools/click-counter/",
}, {
"loop": {
"while": {
"condition": "checkNum", # or alias: "condition": "check",
"xpath": "//*[@id=\"count\"]"
},
"script": [{
"loop": {
"cnt": 5,
"script": {
"method": "click",
"xpath": "//*[@id=\"addbtn\"]",
},
}
},
{
"method": "click",
"xpath": "//*[@id=\"subbtn\"]",
}
]
}
}, {
"method": "getAttribute",
"xpath": "//*[@id=\"count\"]",
"name": "value"
}]
json_str = cpt.Script.generate_json(step)
res = cpt.Script(json_str)(webdriver)
print(res)
webdriver.quit()
Using variables for Baidu search
webdriver = get_driver()
step = [{
"method": "redirect",
"url": "https://www.baidu.com/",
}, {
"method": "input",
"xpath": "//*[@id=\"kw\"]",
"text": "__v-searchKey__",
"if": {
"condition": "presence",
"xpath": "__v-button_xpath__"
}
}, {
"method": "clear"
}]
v1 = cpt.Variable({
"searchKey": "hello",
"button_xpath": "//*[@id=\"su\"]"
})
v2 = cpt.Variable({
"searchKey": "world",
"button_xpath": "//*[@id=\"su\"]"
})
v3 = cpt.Variable({
"searchKey": "world",
"button_xpath": "//*[@id=\"su_no_existence\"]"
})
loader = cpt.Script(step, interval=3)
loader.process(webdriver=webdriver,
variable=v1)
loader.process(webdriver=webdriver,
variable=v2)
loader.process(webdriver=webdriver,
variable=v3)
webdriver.quit()
Use crawlist to crawl web page list information and store it in the store
class MyStore(cpt.StoreBase):
def __init__(self):
self.data = []
@cpt.check(exclude=["driver", "store"])
def crawl_baidu_list(driver: WebDriver, store: MyStore, limit: int) -> None:
if not driver:
return None
pager = MyPager(button_selector=cl.XpathWebElementSelector('//*[@id="page"]/div/a/span'),
webdriver=driver, interval=2)
selector = cl.CssSelector(pattern="#content_left > div")
analyzer = cl.AnalyzerPrettify(pager, selector)
for e in analyzer(limit):
store.data.append(e)
cpt.Script.add_action(crawl_baidu_list)
webdriver = get_driver(is_headless=True)
step = [{
"method": "redirect",
"url": "https://www.baidu.com/",
}, {
"method": "input",
"xpath": "//*[@id=\"kw\"]",
"text": "__v-keyword__",
}, {
"method": "click",
"xpath": "//*[@id=\"su\"]"
}, {
"method": "crawl_baidu_list",
"limit": "__v-limit__",
},{
"method": "clear"
}]
v1 = cpt.Variable({
"limit": 20,
"keyword": "和泉雾纱"
})
store1 = MyStore()
v2 = cpt.Variable({
"limit": 20,
"keyword": "python"
})
store2 = MyStore()
loader = cpt.Script(step, interval=1)
loader.process(webdriver=webdriver, store=store1, variable=v1)
print(store1.data)
loader.process(webdriver=webdriver, store=store2, variable=v2)
print(store2.data)
webdriver.quit()
Execute JavaScript code, return any type, and use built-in store and variable in combination
js_code1 = '''
var element = document.querySelector("body > div > main > div.row.justify-content-center.pt-2.pb-3.-bg-selenium-cyan > div > div > h2");
return element.innerText;
'''
js_code2 = '''
return 1
'''
step = [{
"method": "redirect",
"url": "https://www.selenium.dev/",
}, {
"method": "execute",
"js": "__v-js_code__",
}, {
"method": "log",
"msg": "__PRE_RETURN__"
}]
v1 = cpt.Variable({
"js_code": js_code1,
})
v2 = cpt.Variable({
"js_code": js_code2,
})
webdriver = get_driver(is_headless=True)
loader = cpt.Script(step, interval=3)
s = cpt.Store(is_replace=True)
print(type(loader.process(webdriver, variable=v1, store=s)))
print(type(loader.process(webdriver, variable=v2, store=s)))
print(s.data)
webdriver.quit()