去除网页的安全验证
1 2 3 4 5
import requests

domain = 'https://www.dytt89.com/'

# verify=False skips TLS certificate verification (the step this section
# demonstrates); requests will emit an InsecureRequestWarning for it.
# The `with` block releases the connection even if printing fails, and the
# timeout keeps the script from waiting forever on an unresponsive server.
with requests.get(domain, verify=False, timeout=10) as resp:
    print(resp.text)
|
解码
这个网页是使用gb2312
编码,我们需要指定字符集
1 2 3 4 5 6
import requests

domain = 'https://www.dytt89.com/'

# verify=False skips TLS certificate verification; the timeout keeps the
# script from waiting forever, and `with` closes the response when done.
with requests.get(domain, verify=False, timeout=10) as resp:
    # Set the charset explicitly before reading .text so the body is decoded
    # as gb2312 rather than with whatever encoding requests would pick.
    resp.encoding = 'gb2312'
    print(resp.text)
|
分析数据
1 2 3 4 5 6 7 8 9 10
import re

import requests

domain = 'https://www.dytt89.com/'

# Capture the nearest <ul>...</ul> that follows the literal section title.
# re.S lets `.` match newlines, so the match can span several lines of HTML.
obj1 = re.compile(r'2022必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)

# verify=False skips TLS certificate verification; `with` closes the response
# (the original ended in a truncated `resp.clos`, which raises AttributeError
# instead of closing anything).
with requests.get(domain, verify=False, timeout=10) as resp:
    resp.encoding = 'gb2312'
    for it in obj1.finditer(resp.text):
        print(it.group('ul'))
|
获取子页面的URL
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
import re
from urllib.parse import urljoin

import requests

domain = 'https://www.dytt89.com/'

# obj1 captures the nearest <ul>...</ul> after the literal section title;
# obj2 captures the target of every single-quoted href inside that list.
obj1 = re.compile(r'2022必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)

# verify=False skips TLS certificate verification; `with` closes the response.
with requests.get(domain, verify=False, timeout=10) as resp:
    resp.encoding = 'gb2312'
    page_html = resp.text

# urljoin resolves each href against the site root. Plain string
# concatenation would produce "//" for root-relative hrefs and a broken URL
# for absolute ones.
child_lst = [
    urljoin(domain, itt.group('href'))
    for it in obj1.finditer(page_html)
    for itt in obj2.finditer(it.group('ul'))
]
|
提取子页面
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
import re
import sys
from urllib.parse import urljoin

import requests

domain = 'https://www.dytt89.com/'

# obj1: the nearest <ul>...</ul> after the literal section title.
# obj2: the target of every single-quoted href inside that list.
# obj3: on a child page, the <h1> title and the first download link found
#       after the highlighted table cell.
obj1 = re.compile(r'2022必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
obj3 = re.compile(
    r'<div class="title_all"><h1>(?P<movie>.*?)</h1>'
    r'.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
    r'<a href="(?P<download>.*?)">',
    re.S,
)

# verify=False skips TLS certificate verification; `with` closes the response.
with requests.get(domain, verify=False, timeout=10) as resp:
    resp.encoding = 'gb2312'
    page_html = resp.text

# urljoin resolves each href against the site root, avoiding the "//" that
# plain concatenation produces for root-relative hrefs.
child_lst = [
    urljoin(domain, itt.group('href'))
    for it in obj1.finditer(page_html)
    for itt in obj2.finditer(it.group('ul'))
]

for href in child_lst:
    # Close every child response as soon as its text has been searched.
    with requests.get(href, verify=False, timeout=10) as child_resp:
        child_resp.encoding = 'gb2312'
        result3 = obj3.search(child_resp.text)

    # search() returns None when the page does not match the expected layout;
    # report it and move on instead of crashing on `.group`.
    if result3 is None:
        print(f'skipped (no title/download link matched): {href}', file=sys.stderr)
        continue

    print(result3.group('movie'))
    print(result3.group('download'))
|