分类
编程

python3登录某网站后爬取网站数据库信息

一、引用模块

import requests
from bs4 import BeautifulSoup

requests模块用于进行POST登录

BeautifulSoup模块用于对网页进行解析

二、登录获取cookie

#1、登录后获取cookie
def Get_Cookie(url):
    """POST to the login URL and return the session cookies as a plain dict.

    Parameters:
        url: the login endpoint to POST to.

    Returns:
        dict mapping cookie names to values, taken from the session's
        cookie jar after the login request.

    NOTE(review): no form data is sent with the POST — for a real login,
    pass the credential fields via ``data=`` (site-specific; confirm
    against the target site's login form).
    """
    session = requests.Session()
    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
    }

    # The response body is not needed; the call is made only for its side
    # effect of populating session.cookies (the original bound it to an
    # unused variable `req`).
    session.post(url, headers=headers)

    # Snapshot the cookie jar as a plain dict so it can be passed to
    # later requests via the `cookies=` keyword.
    return dict(session.cookies)

三、获取登录后的cookie进行一些操作

#2、获取登录cookie后进行一些操作
#2、获取登录cookie后进行一些操作
def Get_Shop_Lise():
    """Log in, fetch a page using the session cookies, and return the text
    of the first <td> cell of every table row on the page.

    Returns:
        list[str]: one entry per <tr> that contains at least one <td>;
        rows without a <td> are skipped.
    """
    # Log in first and reuse the returned cookies for the data request.
    # (登录网址 — replace with the real login URL.)
    cookies = Get_Cookie("http://XXX.com")
    # Request headers — adjust Host/Referer to match the actual target site.
    header = {
        "Accept": "text/plain, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6",
        "Connection": "keep-alive",
        "Host": "XXXX.com",
        "Referer": "/index.htm",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    get_url = 'http://xxx.com'  # URL fetched after logging in
    res = requests.post(get_url, headers=header, cookies=cookies)
    soup = BeautifulSoup(res.text, 'lxml')
    # Collect the first <td> of each row. (Fix: the original comment
    # described `result` as a dict; it is, and always was, a list.)
    result = []
    for row in soup.find_all('tr'):
        cell = row.find('td')
        if cell:
            result.append(cell.getText())
    return result