456电影网:爬取某电影网站(未写完)

"""Unfinished scraper for a film site: search a title, then list info and episodes.

Running the file as a script searches the site for ``film`` and, for every
search hit, prints the hit's text information followed by its episode links.
"""
import re
import time

import bs4
import lxml
import requests
from bs4 import BeautifulSoup

# Base URL of the site being scraped.
url = 'https://www.88ys.cc'
# Title of the film or TV series to search for.
film = '家有女友'
# Proxy address (the original author notes it has expired and needs renewal).
proxy = '120.24.245.33:16818'
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
# Request gzip-compressed responses (the original author notes this is faster).
headers = {
    "Accept-Encoding": "gzip"
}

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would block the script forever.
_TIMEOUT = 10

# Patterns are compiled once here instead of on every call.
_RESULT_LINK_RE = re.compile(r'<a class="link-hover" href="(.*?)"')
_HREF_RE = re.compile(r'href="(.*?)"')
_TAG_TEXT_RE = re.compile(r'>(.*?)<')
_TYPE_RE = re.compile(r'类型:</span>(.*?)</dd>')
_LANGUAGE_RE = re.compile(r'语言:</span>(.*?)</dd>')


def _first(matches):
    """Return the first element of *matches*, or '' when there is none."""
    return matches[0] if matches else ''


def search():
    """Submit the search form for ``film`` and return the result links.

    Returns:
        A list with the ``href`` value of every ``<a class="link-hover">``
        anchor found in the response (empty when nothing matched).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    url_search = url + '/index.php?m=vod-search'
    # Form fields submitted with the POST request.
    data = {
        'wd': film,
        'submit': '',
    }
    # To route the request through the proxy, additionally pass
    # ``proxies=proxies, headers=headers``.
    r_s = requests.post(url_search, data=data, timeout=_TIMEOUT)
    r_s.encoding = 'utf-8'
    return _RESULT_LINK_RE.findall(r_s.text)


def List(pat_search):
    """Return the episode links found on one search-result page.

    Args:
        pat_search: Site-relative path of a search hit, as returned by
            ``search()``.

    Returns:
        A list of ``href`` values found inside the first ``<div id="stab81">``
        (per the original author, the first video source), or an empty list
        when the page has no such div.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    url_list = url + pat_search
    # To route the request through the proxy, additionally pass
    # ``proxies=proxies, headers=headers``.
    r_list = requests.get(url_list, timeout=_TIMEOUT)
    r_list.encoding = 'utf-8'
    text_l_b = BeautifulSoup(r_list.text, 'lxml')
    stab81 = text_l_b.find_all(name='div', attrs={'id': 'stab81'})
    if not stab81:
        # The page has no such div; report "no episodes" instead of
        # crashing with IndexError.
        return []
    return _HREF_RE.findall(str(stab81[0]))


def search_news(pat_search):
    """Return the text information shown for one search hit.

    Args:
        pat_search: Site-relative path of a search hit, as returned by
            ``search()``.

    Returns:
        A three-item list: ``[0]`` title, ``[1]`` type, ``[2]`` language.
        A field that cannot be found on the page is returned as ``''``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    url_search = url + pat_search
    r = requests.get(url_search, timeout=_TIMEOUT)
    r.encoding = 'utf-8'
    # Stringified list of the div(s) that hold the film information.
    bs = str(BeautifulSoup(r.text, 'lxml').find_all('div', class_='ct-c'))
    # Re-parse that fragment so the title can be read from its <h1>.
    h1_bs = BeautifulSoup(bs, 'lxml')
    return [
        _first(_TAG_TEXT_RE.findall(str(h1_bs.h1))),
        _first(_TYPE_RE.findall(bs)),
        _first(_LANGUAGE_RE.findall(bs)),
    ]


def _for_():
    """Main routine: print the information and episode links of every hit."""
    for i in search():
        information = search_news(i)
        stab81 = List(i)
        print(information)
        print(stab81)
        # Pause between hits so the site does not drop the connection for
        # requesting too fast.
        time.sleep(3)


def a():
    """Unfinished: fetch one hard-coded player page and print its HTML.

    The original author notes that the scraped source differs from what the
    browser inspector shows.
    """
    _url_ = url + '/vod-play-id-56106-src-1-num-1.html'
    r = requests.get(_url_, timeout=_TIMEOUT)
    r.encoding = 'utf-8'
    bs = BeautifulSoup(r.text, 'lxml')
    print(bs.prettify())


if __name__ == '__main__':
    # Run the scraper only when executed as a script, not on import.
    _for_()

(未解决问题)

1.浏览器查看器(开发者工具)中显示的源码与用 requests 爬取到的源码不一致

2.获取视频链接

3.电影下载操作

转载于:https://www.cnblogs.com/Ly-233/p/11205661.html

相关推荐

相关文章