Python爬虫之pyquery库的基本使用

2018-11-29 09:47:31 来源:博客园 阅读 ()

新老客户大回馈,云服务器低至5折

  1 # 字符串初始化
  2 html = '''
  3 <div>
  4     <ul>
  5         <li class = "item-0">first item</li>
  6         <li class = "item-1"><a href = "link2.html">second item</a></li>
  7         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
  8         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
  9         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 10     </ul>
 11 </div>
 12 '''
 13 
 14 from pyquery import PyQuery as pq
 15 doc = pq(html)
 16 print(doc('li'))
 17 
 18 # url初始化
 19 from pyquery import  PyQuery as pq
 20 doc = pq(url = "http://www.baidu.com")
 21 print(doc("head"))
 22 
 23 # 文件初始化
 24 from pyquery import PyQuery as pq
 25 doc = pq(filename = "demo.html")
 26 print(doc('li'))
 27 
 28 # 基本CSS选择器
 29 html = '''
 30 <div id = "container">
 31     <ul class = "list">
 32         <li class = "item-0">first item</li>
 33         <li class = "item-1"><a href = "link2.html">second item</a></li>
 34         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
 35         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 36         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 37     </ul>
 38 </div>
 39 '''
 40 from pyquery import PyQuery as pq
 41 doc = pq(html)
  42 # 注意:下面的选择器中,id 前面需要加上 #,class 前面需要加上 .
 43 print(doc('#container .list li'))
 44 
 45 # 查找元素
 46 # 子元素
 47 html = '''
 48 <div id = "container">
 49     <ul class = "list">
 50         <li class = "item-0">first item</li>
 51         <li class = "item-1"><a href = "link2.html">second item</a></li>
 52         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
 53         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 54         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 55     </ul>
 56 </div>
 57 '''
 58 from pyquery import PyQuery as pq
 59 doc = pq(html)
 60 items = doc('.list')
 61 print(type(items))
 62 print(items)
 63 lis = items.find('li')
 64 print(type(lis))
 65 print(lis)
 66 
 67 lis = items.children()
 68 print(type(lis))
 69 print(lis)
 70 
 71 lis = items.children('.active')
 72 print(lis)
 73 
 74 # 父元素
 75 html = '''
 76 <div id = "container">
 77     <ul class = "list">
 78         <li class = "item-0">first item</li>
 79         <li class = "item-1"><a href = "link2.html">second item</a></li>
 80         <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
 81         <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 82         <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 83     </ul>
 84 </div>
 85 '''
 86 from pyquery import PyQuery as pq
 87 doc = pq(html)
 88 items = doc('.list')
 89 container = items.parent()
 90 print(type(container))
 91 print(container)
 92 
 93 html = '''
 94 <div class = "wrap">
 95     <div id = "container">
 96         <ul class = "list">
 97             <li class = "item-0">first item</li>
 98             <li class = "item-1"><a href = "link2.html">second item</a></li>
 99             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
100             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
101             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
102         </ul>
103     </div>
104 </div>
105 '''
106 from pyquery import PyQuery as pq
107 doc = pq(html)
108 items = doc('.list')
109 parents = items.parents()
110 print(type(parents))
111 print(parents)
112 
113 parents = items.parents('.wrap')
114 print(parents)
  1 # 兄弟元素
  2 html = '''
  3 <div class = "wrap">
  4     <div id = "container">
  5         <ul class = "list">
  6             <li class = "item-0">first item</li>
  7             <li class = "item-1"><a href = "link2.html">second item</a></li>
  8             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
  9             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 10             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 11         </ul>
 12     </div>
 13 </div>
 14 '''
 15 from pyquery import PyQuery as pq
 16 doc = pq(html)
  17 # 注意下面 .item-0 和 .active 之间没有空格,表示同一个元素同时具有这两个 class
 18 li = doc('.list .item-0.active')
 19 print(li.siblings())
 20 
 21 print(li.siblings('.active'))
 22 
 23 # 遍历
 24 # 单个元素
 25 html = '''
 26 <div class = "wrap">
 27     <div id = "container">
 28         <ul class = "list">
 29             <li class = "item-0">first item</li>
 30             <li class = "item-1"><a href = "link2.html">second item</a></li>
 31             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
 32             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 33             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 34         </ul>
 35     </div>
 36 </div>
 37 '''
 38 from pyquery import PyQuery as pq
 39 doc = pq(html)
 40 li = doc('.item-0.active')
 41 print(li)
 42 
 43 html = '''
 44 <div class = "wrap">
 45     <div id = "container">
 46         <ul class = "list">
 47             <li class = "item-0">first item</li>
 48             <li class = "item-1"><a href = "link2.html">second item</a></li>
 49             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
 50             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 51             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 52         </ul>
 53     </div>
 54 </div>
 55 '''
 56 from pyquery import PyQuery as pq
 57 doc = pq(html)
 58 lis = doc('li').items()
 59 print(type(lis))
 60 for li in lis:
 61     print(li)
 62 
 63 # 获取信息
 64 # 获取属性
 65 html = '''
 66 <div class = "wrap">
 67     <div id = "container">
 68         <ul class = "list">
 69             <li class = "item-0">first item</li>
 70             <li class = "item-1"><a href = "link2.html">second item</a></li>
 71             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
 72             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
 73             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
 74         </ul>
 75     </div>
 76 </div>
 77 '''
 78 from pyquery import PyQuery as pq
 79 doc = pq(html)
 80 a = doc('.item-0.active a')
 81 print(a)
 82 # 获取属性的两种方法
 83 print(a.attr('href'))
 84 print(a.attr.href)
 85 
 86 # 获取文本
 87 print(a.text())
 88 
 89 # 获取html
 90 from pyquery import PyQuery as pq
 91 doc = pq(html)
 92 li = doc('.item-0.active')
 93 print(li)
 94 # 得到<li>标签里面的代码
 95 print(li.html())
 96 
 97 # DOM操作
  98 # addClass、removeClass(下面使用的是对应的 add_class、remove_class 写法)
 99 from pyquery import PyQuery as pq
100 doc = pq(html)
101 li = doc('.item-0.active')
102 print(li)
103 li.remove_class('active')
104 print(li)
105 li.add_class('active')
106 print(li)
107 
108 # attr CSS
109 li.attr('name', 'link')
110 print(li)
111 li.css('font-size', '14px')
112 print(li)
113 
114 # remove
115 html = '''
116 <div class = "wrap">
117     Hello,World
118     <p>This is a paragraph</p>
119 </div>
120 '''
121 from pyquery import PyQuery as pq
122 doc = pq(html)
123 wrap = doc('.wrap')
124 print(wrap.text())
125 wrap.find('p').remove()
126 print(wrap.text())
127 
128 # 伪类选择器
129 html = '''
130 <div class = "wrap">
131     <div id = "container">
132         <ul class = "list">
133             <li class = "item-0">first item</li>
134             <li class = "item-1"><a href = "link2.html">second item</a></li>
135             <li class = "item-0 active"><a href = "link3.html"><span class = "bold">third item</span></a></li>
136             <li class = "item-1 active"><a href = "link4.html">fourth item</a></li>
137             <li class = "item-0"><a href = "link5.html">fifthth item</a></li>
138         </ul>
139     </div>
140 </div>
141 '''
142 from pyquery import PyQuery as pq
143 doc = pq(html)
144 # 获取第一个元素
145 li = doc('li:first-child')
146 print(li)
147 # 获取最后一个元素
148 li = doc('li:last-child')
149 print(li)
150 # 获取第二个元素
151 li = doc('li:nth-child(2)')
152 print(li)
153 # 获取下标为2的元素后面的所有元素(下标从0开始)
154 li = doc('li:gt(2)')
155 print(li)
 156 # 获取位置为偶数的元素(nth-child 的位置从1开始计数,即第2、4个 li)
157 li = doc('li:nth-child(2n)')
158 print(li)
159 # 获取内容包含second 的元素
160 li = doc('li:contains(second)')
161 print(li)

 

标签:

版权声明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有

上一篇:Python爬虫之Beautiful Soup库的基本使用

下一篇:Python:logging.NullHandler 的使用