1、requests的post请求
# 1. POST request with requests: log in to GitHub.
import requests
import re

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

# Fetch the login page first: the hidden authenticity_token and the cookies
# it returns are both sent back with the login request below.
response = requests.get(url='https://github.com/login', headers=headers, timeout=10)
response.raise_for_status()
print(response.text)

# Convert the cookies returned by the login page into a dict.
login_cookies = response.cookies.get_dict()

# Pull the hidden authenticity_token field out of the login form. Fail with a
# clear message instead of a bare IndexError when the field is not present.
token_matches = re.findall(r' name="authenticity_token" value="(.*?)"', response.text, re.S)
if not token_matches:
    raise RuntimeError('authenticity_token not found in the GitHub login page')
authenticity_token = token_matches[0]
print(authenticity_token)

# Build the request headers.
headers2 = {
    'Referer': 'https://github.com/login',
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

# Build the request body.
# NOTE(review): the "utf-8" key is kept as written; GitHub's form field may
# actually be named "utf8" - confirm against the live login form.
# NOTE(review): the login and password are hard-coded placeholders; load real
# credentials from the environment rather than committing them.
form_data = {
    'commit': 'Sign in',
    "utf-8": "✓",
    "authenticity_token": authenticity_token,
    "login": "852653835",
    "password": "******",
    "webauthn-support": "supported"
}

# POST to /session carrying the headers, the form body and the login-page cookies.
response2 = requests.post(
    url='https://github.com/session',
    data=form_data,
    headers=headers2,
    cookies=login_cookies,
    timeout=10,
)
print(response2.status_code)

# Save the returned page so the login result can be inspected.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)
# The Response object: a tour of its most commonly used attributes.
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
}

# This request must actually run: every print below reads `response`.
response = requests.get('https://www.github.com', headers=headers, timeout=10)

print(response.status_code)  # HTTP status code
print(response.url)  # URL of the response
print(response.text)  # body decoded to text
print(response.content)  # body as raw bytes
print(response.headers)  # response headers
print(response.history)  # earlier responses in the redirect chain, if any
print(response.cookies)  # cookies as a cookie jar
print(response.cookies.get_dict())  # cookies converted to a dict
print(response.cookies.items())  # cookies as (name, value) pairs
print(response.encoding)  # character encoding used to decode .text
print(response.elapsed)  # time elapsed between sending the request and receiving the response
2、requests的高级用法
# 2. Advanced requests usage: streaming a large download to disk.
# https = http + ssl
import requests

# Send a GET request to the video URL.
url = 'http://hc.yinyuetai.com/uploads/videos/common/3B7201685F78BF2954FEEB32CB6EBD82.mp4'

# stream=True defers downloading the body until it is iterated. Do not touch
# response.content here: that would read the whole file into memory at once
# and defeat the point of streaming.
with requests.get(url, stream=True, timeout=10) as response:
    response.raise_for_status()
    with open('music.mp4', 'wb') as f:
        # iter_content() defaults to 1-byte chunks; ask for a sensible size.
        for content in response.iter_content(chunk_size=8192):
            f.write(content)
3、selenium模块
'''
Example 1: drive Chrome with Selenium and run a search on Baidu.
'''
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag gestures, e.g. for slider captchas
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait below
from selenium.webdriver.support.wait import WebDriverWait  # waits for elements to load
import time  # needed for time.sleep below

# Open Chrome through the Chrome driver.
# Option 1: pass the chromedriver path explicitly.
chrome = webdriver.Chrome(r'C:\Users\85265\Downloads\chromedriver.exe')
# Option 2: chrome = webdriver.Chrome()

# Whatever happens inside the try, the finally clause shuts the browser down.
try:
    chrome.get('https://www.cnblogs.com/kermitjam')

    # Explicit wait. Argument 1: the driver object; argument 2: timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # Visit Baidu.
    chrome.get('https://www.baidu.com')

    # Locate the search input box. The locator is passed as one tuple argument;
    # fall back to a class-based locator when an element has no id.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'kw')))

    # Type the search keyword.
    input_tag.send_keys('一拳超人')

    # Press Enter to submit the search.
    input_tag.send_keys(Keys.ENTER)

    # Keep the browser open briefly so the results can be seen.
    time.sleep(3)
finally:
    # quit() ends the whole driver session; close() would only close the
    # current window.
    chrome.quit()
# Example 2: drive Chrome with Selenium and run a product search on JD.com.
from selenium import webdriver  # drives the browser
from selenium.webdriver import ActionChains  # drag gestures, e.g. for slider captchas
from selenium.webdriver.common.by import By  # locator strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME
from selenium.webdriver.common.keys import Keys  # keyboard keys
from selenium.webdriver.support import expected_conditions as EC  # used together with WebDriverWait below
from selenium.webdriver.support.wait import WebDriverWait  # waits for elements to load
import time

# Open Chrome through the Chrome driver.
# Option 1: pass the chromedriver path explicitly.
chrome = webdriver.Chrome(r'C:\Users\85265\Downloads\chromedriver.exe')
# Option 2: chrome = webdriver.Chrome()

# Whatever happens inside the try, the finally clause shuts the browser down.
try:
    chrome.get('https://www.cnblogs.com/kermitjam')

    # Explicit wait (waits for a specific element to load).
    # Argument 1: the driver object; argument 2: timeout in seconds.
    wait = WebDriverWait(chrome, 10)

    # Visit JD.com.
    chrome.get('https://www.jd.com')

    # Locate the search input box. The locator is passed as one tuple argument;
    # fall back to a class-based locator when an element has no id.
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))

    # Type the search keyword.
    input_tag.send_keys('唐诗三百首')

    # Find the search button by its class name. Wait until it is clickable,
    # not merely present in the DOM, because it is clicked right away.
    search_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'button')))
    search_button.click()

    # Keep the browser open briefly so the results can be seen.
    time.sleep(8)
finally:
    # quit() ends the whole driver session; close() would only close the
    # current window.
    chrome.quit()
4、今日作业:
'''
Scrape product information from JD.com.

Request URL:
    https://www.jd.com/

Product fields listed in the assignment:
    1. product detail page link
    2. product name
    3. product price
    4. number of reviews
    5. seller (listed in the assignment; not extracted by this script)
'''
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def get_good(driver):
    """Scrape every search-result page reachable from the current page.

    For each page: scroll down so more items load, read the link, name, price
    and review count of every ``gl-item`` element, print each record and
    append it to ``jd.txt``, then click the ``pn-next`` element to move on.

    Pages are walked in a loop rather than by recursion, so the driver is
    closed exactly once and the call depth does not grow with the page count.

    Args:
        driver: a Selenium WebDriver already showing a search-result page.

    The driver is always closed before this function returns or raises.
    """
    try:
        while True:
            # Scroll with JS so that the items further down the page load.
            js_code = '''
                window.scrollTo(0,5000);
            '''
            driver.execute_script(js_code)

            # Wait for the data to load.
            time.sleep(2)

            # Find all product containers on the current page.
            good_list = driver.find_elements_by_class_name('gl-item')

            # Open the output file once per page instead of once per item.
            with open('jd.txt', 'a', encoding='utf-8') as f:
                for good in good_list:
                    # Product link.
                    good_url = good.find_element_by_css_selector(
                        '.p-img a').get_attribute('href')

                    # Product name.
                    good_name = good.find_element_by_css_selector(
                        '.p-name em').text.replace("\n", "--")

                    # Product price.
                    good_price = good.find_element_by_class_name(
                        'p-price').text.replace("\n", ":")

                    # Number of reviews.
                    good_commit = good.find_element_by_class_name(
                        'p-commit').text.replace("\n", " ")

                    good_content = f'''
                    商品链接: {good_url}
                    商品名称: {good_name}
                    商品价格: {good_price}
                    评价人数: {good_commit}
                    \n
                    '''
                    print(good_content)
                    f.write(good_content)

            # Stop when there is no "next page" control at all.
            next_tags = driver.find_elements_by_class_name('pn-next')
            if not next_tags:
                break
            next_tag = next_tags[0]

            # NOTE(review): assumes the site marks an inactive "next" control
            # with a `disabled` class on the last page - confirm against the
            # live markup. If it never does, this check is a harmless no-op.
            if 'disabled' in (next_tag.get_attribute('class') or ''):
                break

            next_tag.click()
            time.sleep(2)

        # Leave the browser open briefly before it is closed.
        time.sleep(10)
    finally:
        driver.close()


if __name__ == '__main__':
    good_name = input('请输入爬取商品信息:').strip()

    driver = webdriver.Chrome()
    driver.implicitly_wait(10)

    # 1. Open the JD home page.
    driver.get('https://www.jd.com/')

    # 2. Type the product name and press Enter to search.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys(good_name)
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)

    get_good(driver)