基本操作
cookie的处理#
手动处理
cookie从抓包工具中捕获封装到headers中
自动处理
session对象(发两次请求:第一次请求产生的cookie会自动保存到session中,第二次请求会自动携带该cookie)
import requests

headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}

# Handle cookies automatically through a Session object.
sess = requests.Session()
# This first request is sent only so that its cookies get stored in sess.
sess.get(url='https://xueqiu.com/', headers=headers)

# The second request goes through the same session as the first one.
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=20367942&count=15&category=-1'
json_data = sess.get(url=url, headers=headers).json()
print(json_data)
代理#
- 代理服务器
- 进行请求转发
- 代理ip+port 作用到get、post方法的proxies={'http':'ip:port'}参数中
- 代理池(列表)
构建一个代理池
from lxml import etree

# Fetch the page that lists the proxy addresses and pull out its text nodes.
url = '代理IP提取网址'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
ip_list = tree.xpath('//body//text()')
# Build the pool as a list of proxies dicts, one per extracted address.
# (The pool was previously appended to without ever being created.)
ips_list = [{'https': ip} for ip in ip_list]

使用代理池操作
import random

from lxml import etree

url = 'https://www.xicidaili.com/nn/%d'
all_data = []
for page in range(1, 30):
    # %-formatting already yields the final string; no extra format() needed.
    new_url = url % page
    # proxies={'http':'ip:port'} -- a random entry of the pool is used per request.
    page_text = requests.get(url=new_url, headers=headers, proxies=random.choice(ips_list)).text
    tree = etree.HTML(page_text)
    # Do not put a tbody tag in the XPath expression, otherwise it causes problems.
    # The first row is skipped by the [1:] slice.
    tr_list = tree.xpath('//*[@id="ip_list"]//tr')[1:]
    for tr in tr_list:
        ip_addr = tr.xpath('./td[2]/text()')[0]
        all_data.append(ip_addr)
print(len(all_data))
验证码的识别#
超级鹰
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """HTTP client for the Chaojiying captcha-recognition endpoints.

    The account password is stored only as its MD5 hex digest, which is what
    gets sent in the ``pass2`` form field of every request.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Store the account data and prepare the shared request parameters.

        username: account name sent as ``user``
        password: plain-text password; hashed with MD5 before being stored
        soft_id: software id sent as ``softid``
        timeout: seconds passed to requests as the timeout of each POST
                 (requests applies no timeout unless one is given)
        """
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the decoded JSON response.

        im_id: image ID of the captcha being reported
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()
if __name__ == '__main__':
    def get_text(imgPath, imgType):
        """Recognise the captcha stored at imgPath and return the 'pic_str' field.

        imgPath: path of the captcha image file
        imgType: captcha type code forwarded to PostPic
        """
        # NOTE(review): account credentials are hard-coded here; consider
        # loading them from configuration instead.
        chaojiying = Chaojiying_Client('jianqingmin', 'jqmkfc039988', '910948')
        # Read the image through a context manager so the file is closed.
        with open(imgPath, 'rb') as img_file:
            im = img_file.read()
        return chaojiying.PostPic(im, imgType)['pic_str']
模拟登陆#
- 验证码的识别
- 动态请求参数
- cookie
单线程 + 多任务异步协程#
协程
如果一个函数的定义被async修饰,则调用该函数后会返回一个协程对象
任务对象
就是对协程对象的进一步封装,有回调
绑定回调
task.add_done_callback(func):回调函数接收一个参数,即func(task);在回调中通过task.result()获取协程的返回值
事件循环对象
事件循环对象是用来装载任务对象的。该对象被启动后,则会异步地处理其内部装载的每一个任务对象(将任务对象手动进行挂起操作)
async,await
注意事项:在特殊函数内部不可以出现不支持异步模块的代码,否则会中断整个异步的效果
aiohttp支持异步请求的模块#
import aiohttp
import asyncio
from lxml import etree
# Test endpoints to request; the scheme must be followed by '//' for the
# host and port to be parsed as such.
urls = [
    'http://127.0.0.1:500/test1',
    'http://127.0.0.1:500/test2',
    'http://127.0.0.1:500/test3',
]
# Special function: sends the request and captures the response data.
# Detail: put `async` in front of every `with` that wraps a blocking
# operation, and `await` in front of every blocking call.
async def get_request(url):
    """Fetch url with aiohttp and return the response body as text."""
    # aiohttp's session and response objects are asynchronous context
    # managers, so they have to be entered with `async with`.
    async with aiohttp.ClientSession() as s:
        # s.get(url, headers=headers, proxy='http://ip:port', params=params)
        async with s.get(url) as response:
            page_text = await response.text()  # read() returns bytes instead
            return page_text
# Callback (data parsing and persistent storage belong here)
def parse(task):
    """Parse the page text produced by a finished task and print the result.

    task: the completed task object; its result() is the page text.
    """
    page_text = task.result()
    tree = etree.HTML(page_text)
    # Text of every <li> element in the page.
    parse_data = tree.xpath('//li/text()')
    print(parse_data)
# Create the event loop explicitly instead of relying on get_event_loop()
# to create one implicitly, which newer Python versions deprecate.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

tasks = []
for url in urls:
    # Calling the coroutine function only creates a coroutine object.
    c = get_request(url)
    # Wrap it in a task bound to this loop and attach the parsing callback.
    task = loop.create_task(c)
    task.add_done_callback(parse)
    tasks.append(task)

# Run until every task has finished.
loop.run_until_complete(asyncio.wait(tasks))
selenium模块在爬虫中的使用#
概念:是一个基于浏览器自动化的模块。
爬虫之间的关联:(之前用requests需要先判断网站的数据是否是动态加载的,所见非所得)
便捷地捕获到动态加载的数据。(所见即所得)
实现模拟登陆#
环境安装:
pip install selenium
准备好某一款浏览器的驱动程序
驱动下载地址:http://chromedriver.storage.googleapis.com/index.html
版本映射表:https://blog.csdn.net/huilan_same/article/details/51896672
基本使用
from selenium import webdriver
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.jd.com/')
    # Locate elements
    # NOTE(review): the find_element_by_* helpers are deprecated in
    # Selenium 4 -- confirm against the installed version.
    search_input = bro.find_element_by_id('key')
    search_input.send_keys('小米笔记本')
    btn = bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
    btn.click()
    # Execute JavaScript: scroll to the bottom of the page
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    # Grab the source of the current page
    page_text = bro.page_source
    print(page_text)
finally:
    # Always shut the browser down, even if one of the steps above raises.
    bro.quit()
爬取动态加载的数据
from selenium import webdriver
from lxml import etree
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('http://scxk.nmpa.gov.cn:81/xk/')
    # Grab the source of the current page
    page_text = bro.page_source
    page_text_list = [page_text]
    # Click the "next page" control three times, keeping each page's source.
    for i in range(3):
        bro.find_element_by_id('pageIto_next').click()
        page_text_list.append(bro.page_source)
    # Parse every collected page.
    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//ul[@id="gzlist"]/li')
        for li in li_list:
            title = li.xpath('./dl/@title')[0]
            num = li.xpath('./ol/@title')[0]
            print(title + ':' + num)
finally:
    # Always shut the browser down, even if one of the steps above raises.
    bro.quit()
动作链
一系列连续的动作
在实现标签定位时,如果发现定位的标签是存在于iframe标签之中的,则在定位时必须执行一个固定的操作:bro.switch_to.frame('id')
12306模拟登录
from selenium import webdriver
from selenium.webdriver import ActionChains
from PIL import Image
from CJY import Chaojiying_Client
from time import sleep

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/login/init')
# Screenshot of the whole page
bro.save_screenshot('main.png')
# Locate the captcha image
code_img_tag = bro.find_element_by_xpath('//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
# x, y of the element, e.g. {'x': 256, 'y': 274}; used below as the
# left/upper edge of the crop box
location = code_img_tag.location
# Height and width, e.g. {'height': 190, 'width': 293}
size = code_img_tag.size
# Crop box (left, upper, right, lower), e.g. (256, 274, 549, 464)
rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
          int(location['y'] + size['height']))
# Open the full-page screenshot
i = Image.open('main.png')
# Crop the captcha out of it
frame = i.crop(rangle)
# Save the captcha image
sleep(1)
frame.save('code.png')


# Recognise the captcha with Chaojiying
def get_text(imgPath, imgType):
    """Recognise the captcha stored at imgPath and return the 'pic_str' field."""
    # NOTE(review): account credentials are hard-coded here; consider
    # loading them from configuration instead.
    chaojiying = Chaojiying_Client('jianqingmin', 'jqmkfc039988', '910948')
    # Read the image through a context manager so the file is closed.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']


result = get_text('code.png', 9004)
# Convert 'x1,y1|x2,y2' into [[x1, y1], [x2, y2]]. split('|') also covers
# the single-point case, because it then returns a one-element list.
all_list = []
for point in result.split('|'):
    parts = point.split(',')
    all_list.append([int(parts[0]), int(parts[1])])

# Action chain: click each point on the captcha
for a in all_list:
    x = a[0]
    y = a[1]
    # Build a new chain for every click: a reused chain keeps its queued
    # actions, so each perform() could replay the earlier clicks.
    ActionChains(bro).move_to_element_with_offset(code_img_tag, x, y).click().perform()
    sleep(1)

无头浏览器的操作,无可视化界面的浏览器
from selenium import webdriverfrom selenium.webdriver.chrome.options import Optionschrome_options = Options()chrome_options.add_argument('--headless')chrome_options.add_argument('--disable-gou')driver = webdriver.Chrome('chromedriver.exe', chrome_options=chrome_options)driver.get('https://kyfw.12306.cn/otn/login/init')如何规避selenium被检测 【已失效】
from selenium import webdriver
from selenium.webdriver import ChromeOptions

options = ChromeOptions()
# Exclude the 'enable-automation' switch when launching Chrome.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome('chromedriver.exe', options=options)
driver.get('https://www.taobao.com/')